Skip to content

Commit 86cf8a1

Browse files
committed
[LoopVectorize] Enable more early exit vectorisation tests
PR #112138 introduced initial support for dispatching to multiple exit blocks via split middle blocks. This patch fixes a few issues so that we can enable more tests to use the new enable-early-exit-vectorization flag. Fixes are: 1. The check that bails out on loop live-out values happens too late. This is because collectUsersInExitBlocks ignores induction variables, which are instead handled in fixupIVUsers. I've moved the check much earlier, into processLoop, where it looks for outside users of loop-defined values. 2. We shouldn't yet be interleaving when vectorising loops with uncountable early exits, since we've not added support for this yet. 3. Similarly, we also shouldn't be creating vector epilogues. 4. Similarly, we shouldn't enable tail-folding. 5. The existing implementation doesn't yet support loops that require scalar epilogues, although I plan to add that as part of PR #88385. 6. The new split middle blocks weren't being added to the parent loop.
1 parent e84566f commit 86cf8a1

8 files changed

+302
-81
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 69 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3016,6 +3016,22 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
30163016
PSE.getSE()->forgetLoop(OrigLoop);
30173017
PSE.getSE()->forgetBlockAndLoopDispositions();
30183018

3019+
// When dealing with uncountable early exits we create middle.split blocks
3020+
// between the vector loop region and the exit block. These blocks need
3021+
// adding to any outer loop.
3022+
VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3023+
Loop *OuterLoop = OrigLoop->getParentLoop();
3024+
if (Legal->hasUncountableEarlyExit() && OuterLoop) {
3025+
VPBasicBlock *MiddleVPBB = State.Plan->getMiddleBlock();
3026+
VPBlockBase *PredVPBB = MiddleVPBB->getSinglePredecessor();
3027+
while (PredVPBB && PredVPBB != VectorRegion) {
3028+
BasicBlock *MiddleSplitBB =
3029+
State.CFG.VPBB2IRBB[cast<VPBasicBlock>(PredVPBB)];
3030+
OuterLoop->addBasicBlockToLoop(MiddleSplitBB, *LI);
3031+
PredVPBB = PredVPBB->getSinglePredecessor();
3032+
}
3033+
}
3034+
30193035
// After vectorization, the exit blocks of the original loop will have
30203036
// additional predecessors. Invalidate SCEVs for the exit phis in case SE
30213037
// looked through single-entry phis.
@@ -3046,7 +3062,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
30463062
for (Instruction *PI : PredicatedInstructions)
30473063
sinkScalarOperands(&*PI);
30483064

3049-
VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
30503065
VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
30513066
BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
30523067

@@ -4123,7 +4138,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
41234138
// a bottom-test and a single exiting block. We'd have to handle the fact
41244139
// that not every instruction executes on the last iteration. This will
41254140
// require a lane mask which varies through the vector loop body. (TODO)
4126-
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4141+
if (Legal->hasUncountableEarlyExit() ||
4142+
TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
41274143
// If there was a tail-folding hint/switch, but we can't fold the tail by
41284144
// masking, fallback to a vectorization with a scalar epilogue.
41294145
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
@@ -4753,7 +4769,9 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
47534769
// Epilogue vectorization code has not been audited to ensure it handles
47544770
// non-latch exits properly. It may be fine, but it needs to be audited and
47554771
// tested.
4756-
if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4772+
// TODO: Add support for loops with an early exit.
4773+
if (Legal->hasUncountableEarlyExit() ||
4774+
OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
47574775
return false;
47584776

47594777
return true;
@@ -5001,6 +5019,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
50015019
if (!Legal->isSafeForAnyVectorWidth())
50025020
return 1;
50035021

5022+
// We don't attempt to perform interleaving for early exit loops.
5023+
if (Legal->hasUncountableEarlyExit())
5024+
return 1;
5025+
50045026
auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
50055027
const bool HasReductions = !Legal->getReductionVars().empty();
50065028

@@ -7813,11 +7835,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
78137835
// 2.5 When vectorizing the epilogue, fix reduction and induction resume
78147836
// values from the additional bypass block.
78157837
if (VectorizingEpilogue) {
7838+
assert(!ILV.Legal->hasUncountableEarlyExit() &&
7839+
"Epilogue vectorisation not yet supported with early exits");
78167840
BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
78177841
for (VPRecipeBase &R : *ExitVPBB) {
78187842
fixReductionScalarResumeWhenVectorizingEpilog(
78197843
&R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock);
78207844
}
7845+
78217846
BasicBlock *PH = OrigLoop->getLoopPreheader();
78227847
for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
78237848
auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
@@ -10177,13 +10202,33 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1017710202
return false;
1017810203
}
1017910204

10180-
if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10181-
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10182-
"early exit is not enabled",
10183-
"Auto-vectorization of loops with uncountable "
10184-
"early exit is not enabled",
10185-
"UncountableEarlyExitLoopsDisabled", ORE, L);
10186-
return false;
10205+
if (LVL.hasUncountableEarlyExit()) {
10206+
if (!EnableEarlyExitVectorization) {
10207+
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10208+
"early exit is not enabled",
10209+
"Auto-vectorization of loops with uncountable "
10210+
"early exit is not enabled",
10211+
"UncountableEarlyExitLoopsDisabled", ORE, L);
10212+
return false;
10213+
}
10214+
10215+
// Needed to prevent InnerLoopVectorizer::fixupIVUsers from crashing.
10216+
for (BasicBlock *BB : L->blocks()) {
10217+
for (Instruction &I : *BB) {
10218+
for (User *U : I.users()) {
10219+
Instruction *UI = cast<Instruction>(U);
10220+
if (!L->contains(UI)) {
10221+
reportVectorizationFailure(
10222+
"Auto-vectorization of loops with uncountable "
10223+
"early exit and live-outs is not yet supported",
10224+
"Auto-vectorization of loop with uncountable "
10225+
"early exit and live-outs is not yet supported",
10226+
"UncountableEarlyExitLoopLiveOutsUnsupported", ORE, L);
10227+
return false;
10228+
}
10229+
}
10230+
}
10231+
}
1018710232
}
1018810233

1018910234
// Entrance to the VPlan-native vectorization path. Outer loops are processed
@@ -10208,6 +10253,20 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1020810253
if (UseInterleaved)
1020910254
IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
1021010255

10256+
if (LVL.hasUncountableEarlyExit()) {
10257+
BasicBlock *LoopLatch = L->getLoopLatch();
10258+
if (IAI.requiresScalarEpilogue() ||
10259+
llvm::any_of(LVL.getCountableExitingBlocks(),
10260+
[LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10261+
reportVectorizationFailure("Auto-vectorization of early exit loops "
10262+
"requiring a scalar epilogue is unsupported",
10263+
"Auto-vectorization of early exit loops "
10264+
"requiring a scalar epilogue is unsupported",
10265+
"UncountableEarlyExitUnsupported", ORE, L);
10266+
return false;
10267+
}
10268+
}
10269+
1021110270
// Check the function attributes and profiles to find out if this function
1021210271
// should be optimized for size.
1021310272
ScalarEpilogueLowering SEL =

llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

Lines changed: 79 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -S < %s -p loop-vectorize | FileCheck %s --check-prefixes=CHECK
2+
; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s --check-prefixes=CHECK
33

44
target triple = "aarch64-unknown-linux-gnu"
55

@@ -272,22 +272,66 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
272272
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
273273
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
274274
; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023
275+
; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[END]] to i10
276+
; CHECK-NEXT: [[TMP20:%.*]] = zext i10 [[TMP19]] to i64
277+
; CHECK-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP20]], i64 1)
278+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 12
279+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
280+
; CHECK: vector.scevcheck:
281+
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[END_CLAMPED]], i32 1)
282+
; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[UMAX]], -1
283+
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
284+
; CHECK-NEXT: [[TMP4:%.*]] = add i8 1, [[TMP3]]
285+
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i8 [[TMP4]], 1
286+
; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP2]], 255
287+
; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
288+
; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
289+
; CHECK: vector.ph:
290+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4
291+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
292+
; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i8
275293
; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
294+
; CHECK: vector.body:
295+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ]
296+
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
297+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP8]]
298+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
299+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
300+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[TMP8]]
301+
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
302+
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
303+
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD3]]
304+
; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true)
305+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
306+
; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP14]], splat (i1 true)
307+
; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP15]])
308+
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
309+
; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
310+
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
311+
; CHECK: middle.split:
312+
; CHECK-NEXT: br i1 [[TMP16]], label [[FOUND:%.*]], label [[MIDDLE_BLOCK:%.*]]
313+
; CHECK: middle.block:
314+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
315+
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
316+
; CHECK: scalar.ph:
317+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
318+
; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
319+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
276320
; CHECK: for.body:
277-
; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
278-
; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ]
321+
; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
322+
; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
279323
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]]
280324
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
281325
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]]
282326
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
283327
; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
284-
; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]]
328+
; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND]], label [[FOR_INC]]
285329
; CHECK: for.inc:
286330
; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1
287331
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32
288332
; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1
289333
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]]
290-
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY1]], label [[EXIT:%.*]]
334+
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
291335
; CHECK: found:
292336
; CHECK-NEXT: ret i32 1
293337
; CHECK: exit:
@@ -325,9 +369,39 @@ exit:
325369
ret i32 0
326370
}
327371

372+
%my.struct = type { i8, i8 }
373+
374+
define i64 @same_exit_block_requires_interleaving() {
375+
entry:
376+
%p1 = alloca [128 x %my.struct]
377+
call void @init_mem(ptr %p1, i64 256)
378+
br label %loop
379+
380+
loop:
381+
%index = phi i64 [ %index.next, %loop.latch ], [ 3, %entry ]
382+
%arrayidx = getelementptr inbounds [128 x %my.struct], ptr %p1, i64 0, i64 %index
383+
%ld1 = load i8, ptr %arrayidx, align 1
384+
%cmp3 = icmp eq i8 %ld1, 3
385+
br i1 %cmp3, label %loop.latch, label %loop.end
386+
387+
loop.latch:
388+
%index.next = add i64 %index, 1
389+
%exitcond = icmp ne i64 %index.next, 69
390+
br i1 %exitcond, label %loop, label %loop.end
391+
392+
loop.end:
393+
%retval = phi i64 [ 0, %loop.latch ], [ 1, %loop ]
394+
ret i64 %retval
395+
}
328396

329397
declare i32 @foo(i32) readonly
330398
declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>)
331399

332400
attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" }
333401
attributes #1 = { "target-features"="+sve" vscale_range(1,16) }
402+
;.
403+
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
404+
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
405+
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
406+
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
407+
;.

llvm/test/Transforms/LoopVectorize/early_exit_legality.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ define i64 @same_exit_block_pre_inc_use1() {
4949
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1'
5050
; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63
5151
; CHECK-NEXT: LV: We can vectorize this loop!
52-
; CHECK-NOT: LV: Not vectorizing
52+
; CHECK-NOT: LV: Auto-vectorization of loops with uncountable early exit and live-outs is not yet supported.
5353
entry:
5454
%p1 = alloca [1024 x i8]
5555
%p2 = alloca [1024 x i8]
@@ -141,7 +141,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
141141
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit'
142142
; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63
143143
; CHECK-NEXT: LV: We can vectorize this loop!
144-
; CHECK: LV: Not vectorizing: Some exit values in loop with uncountable exit not supported yet.
144+
; CHECK: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit and live-outs is not yet supported.
145145
entry:
146146
%p1 = alloca [1024 x i8]
147147
call void @init_mem(ptr %p1, i64 1024)

llvm/test/Transforms/LoopVectorize/multi_early_exit.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
2+
; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s
33

44
declare void @init_mem(ptr, i64);
55

llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
2+
; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s
33

44
declare void @init_mem(ptr, i64);
55

llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
2+
; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s
33

44
declare void @init_mem(ptr, i64);
55

0 commit comments

Comments
 (0)