Skip to content

Commit 2b5ea6b

Browse files
[LoopVectorize] Don't replicate blocks with optsize
Any VPlan we generate that contains a replicator region will result in replicated blocks in the output, causing a large code size increase. Reject such VPlans when optimizing for size, as the code size impact is usually worse than having a scalar epilogue, which we already forbid with optsize. This change requires a lot of test changes. For tests of optsize specifically, I've updated the tests with the new output; the other tests have been adjusted to not rely on optsize. Fixes #66652
1 parent b3b0070 commit 2b5ea6b

File tree

10 files changed

+71
-438
lines changed

10 files changed

+71
-438
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4565,6 +4565,14 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45654565
return false;
45664566
}
45674567

4568+
static bool hasReplicatorRegion(VPlan &Plan) {
4569+
for (auto *VPRB : VPBlockUtils::blocksOnly<VPRegionBlock>(
4570+
vp_depth_first_deep(Plan.getEntry())))
4571+
if (VPRB->isReplicator())
4572+
return true;
4573+
return false;
4574+
}
4575+
45684576
#ifndef NDEBUG
45694577
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45704578
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
@@ -4614,6 +4622,15 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
46144622
continue;
46154623
}
46164624

4625+
if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4626+
LLVM_DEBUG(
4627+
dbgs()
4628+
<< "LV: Not considering vector loop of width " << VF
4629+
<< " because it would cause replicated blocks to be generated,"
4630+
<< " which isn't allowed when optimizing for size.\n");
4631+
continue;
4632+
}
4633+
46174634
if (isMoreProfitable(Candidate, ChosenFactor))
46184635
ChosenFactor = Candidate;
46194636
}
@@ -7537,6 +7554,14 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
75377554
<< " because it will not generate any vector instructions.\n");
75387555
continue;
75397556
}
7557+
if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
7558+
LLVM_DEBUG(
7559+
dbgs()
7560+
<< "LV: Not considering vector loop of width " << VF
7561+
<< " because it would cause replicated blocks to be generated,"
7562+
<< " which isn't allowed when optimizing for size.\n");
7563+
continue;
7564+
}
75407565

75417566
InstructionCost Cost = cost(*P, VF);
75427567
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
; REQUIRES: asserts
3-
; RUN: opt < %s -passes=loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
4-
; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
3+
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
4+
; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
55

66
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
77
target triple = "aarch64--linux-gnu"
88

99
; This test checks that we correctly compute the scalarized operands for a
10-
; user-specified vectorization factor when interleaving is disabled. We use the
11-
; "optsize" attribute to disable all interleaving calculations. A cost of 4
12-
; for %var4 indicates that we would scalarize its operand (%var3), giving
10+
; user-specified vectorization factor when interleaving is disabled. We use
11+
; -force-vector-interleave=1 to disable all interleaving calculations. A cost of
12+
; 4 for %var4 indicates that we would scalarize its operand (%var3), giving
1313
; %var4 a lower scalarization overhead.
1414
;
1515
; COST-LABEL: predicated_udiv_scalarized_operand
1616
; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3
1717
;
1818
;
19-
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
19+
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
2020
; CHECK-LABEL: @predicated_udiv_scalarized_operand(
2121
; CHECK-NEXT: entry:
2222
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 11 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1584,55 +1584,29 @@ exit:
15841584
ret void
15851585
}
15861586

1587-
define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
1587+
define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
15881588
; DEFAULT-LABEL: define void @redundant_branch_and_tail_folding(
1589-
; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
1589+
; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
15901590
; DEFAULT-NEXT: entry:
15911591
; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
15921592
; DEFAULT: vector.ph:
15931593
; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
15941594
; DEFAULT: vector.body:
1595-
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
1596-
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
1597-
; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 20)
1595+
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1596+
; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1597+
; DEFAULT-NEXT: [[VEC_IND:%.*]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
15981598
; DEFAULT-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
15991599
; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
1600-
; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
1601-
; DEFAULT-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
1602-
; DEFAULT: pred.store.if:
1603-
; DEFAULT-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
1604-
; DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
1605-
; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE]]
1606-
; DEFAULT: pred.store.continue:
1607-
; DEFAULT-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
1608-
; DEFAULT-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
1609-
; DEFAULT: pred.store.if1:
1610-
; DEFAULT-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
1611-
; DEFAULT-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4
1612-
; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE2]]
1613-
; DEFAULT: pred.store.continue2:
1614-
; DEFAULT-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
1615-
; DEFAULT-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
1616-
; DEFAULT: pred.store.if3:
1617-
; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
1618-
; DEFAULT-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4
1619-
; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE4]]
1620-
; DEFAULT: pred.store.continue4:
1621-
; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
1622-
; DEFAULT-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
1623-
; DEFAULT: pred.store.if5:
16241600
; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
16251601
; DEFAULT-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4
1626-
; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE6]]
1627-
; DEFAULT: pred.store.continue6:
1628-
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1602+
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
16291603
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
1630-
; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
1631-
; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
1604+
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
1605+
; DEFAULT-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
16321606
; DEFAULT: middle.block:
1633-
; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
1607+
; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
16341608
; DEFAULT: scalar.ph:
1635-
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1609+
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
16361610
; DEFAULT-NEXT: br label [[LOOP_HEADER:%.*]]
16371611
; DEFAULT: loop.header:
16381612
; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -1649,7 +1623,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
16491623
; DEFAULT-NEXT: ret void
16501624
;
16511625
; PRED-LABEL: define void @redundant_branch_and_tail_folding(
1652-
; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
1626+
; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
16531627
; PRED-NEXT: entry:
16541628
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
16551629
; PRED: vector.ph:

0 commit comments

Comments
 (0)