Skip to content

Commit eafbb87

Browse files
[LoopVectorize] Don't replicate blocks with optsize (#129265)
Any VPlan we generate that contains a replicator region will result in replicated blocks in the output, causing a large code size increase. Reject such VPlans when optimizing for size, as the code size impact is usually worse than having a scalar epilogue, which we already forbid with optsize. This change requires a lot of test changes. For tests that specifically exercise optsize, I've updated the test with the new output; otherwise, the tests have been adjusted to not rely on optsize. Fixes #66652
1 parent 90ddb54 commit eafbb87

File tree

10 files changed

+94
-429
lines changed

10 files changed

+94
-429
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4526,6 +4526,12 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45264526
return false;
45274527
}
45284528

4529+
/// Returns true if any VPRegionBlock found by a shallow depth-first walk
/// starting at the entry of \p Plan's vector loop region reports
/// isReplicator(). Per the commit description, a replicator region results in
/// replicated blocks in the generated output; the callers in this change use
/// this to skip such plans when optimizing for size (unless vectorization is
/// forced).
static bool hasReplicatorRegion(VPlan &Plan) {
4530+
return any_of(VPBlockUtils::blocksOnly<VPRegionBlock>(vp_depth_first_shallow(
4531+
Plan.getVectorLoopRegion()->getEntry())),
4532+
[](auto *VPRB) { return VPRB->isReplicator(); });
4533+
}
4534+
45294535
#ifndef NDEBUG
45304536
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45314537
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
@@ -4598,6 +4604,15 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45984604
continue;
45994605
}
46004606

4607+
if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4608+
LLVM_DEBUG(
4609+
dbgs()
4610+
<< "LV: Not considering vector loop of width " << VF
4611+
<< " because it would cause replicated blocks to be generated,"
4612+
<< " which isn't allowed when optimizing for size.\n");
4613+
continue;
4614+
}
4615+
46014616
if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
46024617
ChosenFactor = Candidate;
46034618
}
@@ -7776,6 +7791,14 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
77767791
<< " because it will not generate any vector instructions.\n");
77777792
continue;
77787793
}
7794+
if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
7795+
LLVM_DEBUG(
7796+
dbgs()
7797+
<< "LV: Not considering vector loop of width " << VF
7798+
<< " because it would cause replicated blocks to be generated,"
7799+
<< " which isn't allowed when optimizing for size.\n");
7800+
continue;
7801+
}
77797802

77807803
InstructionCost Cost = cost(*P, VF);
77817804
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
; REQUIRES: asserts
3-
; RUN: opt < %s -passes=loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
4-
; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
3+
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
4+
; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
55

66
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
77
target triple = "aarch64--linux-gnu"
88

99
; This test checks that we correctly compute the scalarized operands for a
10-
; user-specified vectorization factor when interleaving is disabled. We use the
11-
; "optsize" attribute to disable all interleaving calculations. A cost of 4
12-
; for %var4 indicates that we would scalarize it's operand (%var3), giving
10+
; user-specified vectorization factor when interleaving is disabled. We use
11+
; -force-vector-interleave=1 to disable all interleaving calculations. A cost of
12+
; 4 for %var4 indicates that we would scalarize its operand (%var3), giving
1313
; %var4 a lower scalarization overhead.
1414
;
1515
; COST-LABEL: predicated_udiv_scalarized_operand
1616
; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3
1717
;
1818
;
19-
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
19+
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
2020
; CHECK-LABEL: @predicated_udiv_scalarized_operand(
2121
; CHECK-NEXT: entry:
2222
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 16 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1472,55 +1472,29 @@ exit:
14721472
ret void
14731473
}
14741474

1475-
define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
1475+
define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
14761476
; DEFAULT-LABEL: define void @redundant_branch_and_tail_folding(
1477-
; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
1477+
; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
14781478
; DEFAULT-NEXT: [[ENTRY:.*]]:
14791479
; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
14801480
; DEFAULT: [[VECTOR_PH]]:
14811481
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
14821482
; DEFAULT: [[VECTOR_BODY]]:
1483-
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
1484-
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
1485-
; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 20)
1486-
; DEFAULT-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
1487-
; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
1488-
; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
1489-
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
1490-
; DEFAULT: [[PRED_STORE_IF]]:
1491-
; DEFAULT-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
1492-
; DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
1493-
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
1494-
; DEFAULT: [[PRED_STORE_CONTINUE]]:
1495-
; DEFAULT-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
1496-
; DEFAULT-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
1497-
; DEFAULT: [[PRED_STORE_IF1]]:
1498-
; DEFAULT-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
1499-
; DEFAULT-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4
1500-
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE2]]
1501-
; DEFAULT: [[PRED_STORE_CONTINUE2]]:
1502-
; DEFAULT-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
1503-
; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
1504-
; DEFAULT: [[PRED_STORE_IF3]]:
1505-
; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
1506-
; DEFAULT-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4
1507-
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE4]]
1508-
; DEFAULT: [[PRED_STORE_CONTINUE4]]:
1509-
; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
1510-
; DEFAULT-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
1511-
; DEFAULT: [[PRED_STORE_IF5]]:
1512-
; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
1513-
; DEFAULT-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4
1514-
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE6]]
1515-
; DEFAULT: [[PRED_STORE_CONTINUE6]]:
1516-
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1517-
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
1518-
; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
1519-
; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
1483+
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
1484+
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
1485+
; DEFAULT-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
1486+
; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 1)
1487+
; DEFAULT-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i32>
1488+
; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
1489+
; DEFAULT-NEXT: store i32 [[TMP2]], ptr [[DST]], align 4
1490+
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
1491+
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
1492+
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
1493+
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
15201494
; DEFAULT: [[MIDDLE_BLOCK]]:
1521-
; DEFAULT-NEXT: br label %[[EXIT:.*]]
1495+
; DEFAULT-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
15221496
; DEFAULT: [[SCALAR_PH]]:
1523-
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
1497+
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
15241498
; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
15251499
; DEFAULT: [[LOOP_HEADER]]:
15261500
; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -1537,7 +1511,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
15371511
; DEFAULT-NEXT: ret void
15381512
;
15391513
; PRED-LABEL: define void @redundant_branch_and_tail_folding(
1540-
; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
1514+
; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
15411515
; PRED-NEXT: [[ENTRY:.*]]:
15421516
; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
15431517
; PRED: [[VECTOR_PH]]:

0 commit comments

Comments
 (0)