Skip to content

[LoopVectorize] Don't replicate blocks with optsize #129265

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
23 changes: 23 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4526,6 +4526,12 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
return false;
}

static bool hasReplicatorRegion(VPlan &Plan) {
return any_of(VPBlockUtils::blocksOnly<VPRegionBlock>(vp_depth_first_shallow(
Plan.getVectorLoopRegion()->getEntry())),
[](auto *VPRB) { return VPRB->isReplicator(); });
}

#ifndef NDEBUG
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
Expand Down Expand Up @@ -4598,6 +4604,15 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
continue;
}

if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
<< " because it would cause replicated blocks to be generated,"
<< " which isn't allowed when optimizing for size.\n");
continue;
}

if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
ChosenFactor = Candidate;
}
Expand Down Expand Up @@ -7771,6 +7786,14 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
<< " because it will not generate any vector instructions.\n");
continue;
}
if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
<< " because it would cause replicated blocks to be generated,"
<< " which isn't allowed when optimizing for size.\n");
continue;
}

InstructionCost Cost = cost(*P, VF);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt < %s -passes=loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; This test checks that we correctly compute the scalarized operands for a
; user-specified vectorization factor when interleaving is disabled. We use the
; "optsize" attribute to disable all interleaving calculations. A cost of 4
; for %var4 indicates that we would scalarize it's operand (%var3), giving
; user-specified vectorization factor when interleaving is disabled. We use
; -force-vector-interleave=1 to disable all interleaving calculations. A cost of
; 4 for %var4 indicates that we would scalarize it's operand (%var3), giving
; %var4 a lower scalarization overhead.
;
; COST-LABEL: predicated_udiv_scalarized_operand
; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3
;
;
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
; CHECK-LABEL: @predicated_udiv_scalarized_operand(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1472,55 +1472,29 @@ exit:
ret void
}

define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
; DEFAULT-LABEL: define void @redundant_branch_and_tail_folding(
; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
; DEFAULT-NEXT: [[ENTRY:.*]]:
; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; DEFAULT: [[VECTOR_PH]]:
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 20)
; DEFAULT-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; DEFAULT: [[PRED_STORE_IF]]:
; DEFAULT-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
; DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
; DEFAULT: [[PRED_STORE_CONTINUE]]:
; DEFAULT-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
; DEFAULT-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
; DEFAULT: [[PRED_STORE_IF1]]:
; DEFAULT-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
; DEFAULT-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; DEFAULT: [[PRED_STORE_CONTINUE2]]:
; DEFAULT-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
; DEFAULT: [[PRED_STORE_IF3]]:
; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
; DEFAULT-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; DEFAULT: [[PRED_STORE_CONTINUE4]]:
; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
; DEFAULT-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
; DEFAULT: [[PRED_STORE_IF5]]:
; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
; DEFAULT-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; DEFAULT: [[PRED_STORE_CONTINUE6]]:
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 1)
; DEFAULT-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i32>
; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; DEFAULT-NEXT: store i32 [[TMP2]], ptr [[DST]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: br label %[[EXIT:.*]]
; DEFAULT-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; DEFAULT: [[SCALAR_PH]]:
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
; DEFAULT: [[LOOP_HEADER]]:
; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
Expand All @@ -1537,7 +1511,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
; DEFAULT-NEXT: ret void
;
; PRED-LABEL: define void @redundant_branch_and_tail_folding(
; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
; PRED-NEXT: [[ENTRY:.*]]:
; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
Expand Down
Loading
Loading