[LoopVectorize] Add cost of generating tail-folding mask to the loop #90191
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: David Sherwood (david-arm)

Changes

At the moment if we decide to enable tail-folding we do not include the cost of generating the mask per VF. This can mean we make some poor choices of VF, which is definitely true for SVE-enabled AArch64 targets where mask generation for fixed-width vectors is more expensive than for scalable vectors.

New tests added: Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll

Patch is 21.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90191.diff

6 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c2..29333f1b7813e3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1610,6 +1610,9 @@ class LoopVectorizationCostModel {
/// vectorization has actually taken place).
using VectorizationCostTy = std::pair<InstructionCost, bool>;
+ /// Calculate the cost of generating the mask when tail-folding for the VF.
+ InstructionCost getTailFoldMaskCost(ElementCount VF);
+
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
@@ -5945,6 +5948,35 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
return Discount;
}
+InstructionCost
+LoopVectorizationCostModel::getTailFoldMaskCost(ElementCount VF) {
+ if (VF.isScalar())
+ return 0;
+
+ InstructionCost MaskCost;
+ Type *IndTy = Legal->getWidestInductionType();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ TailFoldingStyle Style = getTailFoldingStyle();
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ VectorType *RetTy = VectorType::get(IntegerType::getInt1Ty(Context), VF);
+ if (useActiveLaneMask(Style)) {
+ IntrinsicCostAttributes Attrs(
+ Intrinsic::get_active_lane_mask, RetTy,
+ {PoisonValue::get(IndTy), PoisonValue::get(IndTy)});
+ MaskCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
+ } else {
+ // This is just a stepvector, added to a splat of the current IV, followed
+ // by a vector comparison with a splat of the trip count. Since the
+ // stepvector is loop invariant it will be hoisted out so we can ignore it.
+ // This just leaves us with an add and an icmp.
+ VectorType *VecTy = VectorType::get(IndTy, VF);
+ MaskCost = TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
+ MaskCost += TTI.getCmpSelInstrCost(Instruction::ICmp, VecTy, RetTy,
+ ICmpInst::ICMP_ULE, CostKind, nullptr);
+ }
+ return MaskCost;
+}
+
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
@@ -5993,6 +6025,15 @@ LoopVectorizationCostModel::expectedCost(
Cost.second |= BlockCost.second;
}
+ // If we're using tail-folding then we should add the cost of generating the
+ // mask.
+ if (foldTailByMasking()) {
+ InstructionCost MaskCost = getTailFoldMaskCost(VF);
+ LLVM_DEBUG(dbgs() << "LV: Adding cost of generating tail-fold mask for VF "
+ << VF << ": " << MaskCost << '\n');
+ Cost.first += MaskCost;
+ }
+
return Cost;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
index af266653af8033..7d473e01022595 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
@@ -1,4 +1,7 @@
-; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue <%s | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -debug-only=loop-vectorize <%s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=COST
target triple = "aarch64-unknown-linux-gnu"
@@ -32,4 +35,29 @@ for.end: ; preds = %for.body
ret i32 0
}
+
+; COST: LV: Checking a loop in 'simple_memset'
+; COST: LV: Adding cost of generating tail-fold mask for VF 1: 0
+; COST: LV: Adding cost of generating tail-fold mask for VF 2: 4
+; COST: LV: Adding cost of generating tail-fold mask for VF 4: 8
+; COST: LV: Adding cost of generating tail-fold mask for VF vscale x 1: 4
+; COST: LV: Adding cost of generating tail-fold mask for VF vscale x 2: 1
+; COST: LV: Adding cost of generating tail-fold mask for VF vscale x 4: 1
+
+define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
+entry:
+ br label %while.body
+
+while.body: ; preds = %while.body, %entry
+ %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+ %gep = getelementptr i32, ptr %ptr, i64 %index
+ store i32 %val, ptr %gep
+ %index.next = add nsw i64 %index, 1
+ %cmp10 = icmp ult i64 %index.next, %n
+ br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ ret void
+}
+
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index 596e42e9f094de..042cb11dc48b3b 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -15,6 +15,7 @@ define void @pred_loop(ptr %off, ptr %data, ptr %dst, i32 %n) #0 {
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add1, ptr %arrayidx2, align 4
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i32 %add, %n
; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond.not, label %exit.loopexit, label %for.body
+; CHECK-COST-NEXT: LV: Adding cost of generating tail-fold mask: 0
; CHECK-COST-NEXT: LV: Scalar loop costs: 5.
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
new file mode 100644
index 00000000000000..9e8314fecb5851
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
@@ -0,0 +1,27 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple riscv64-linux-gnu -mattr=+v,+f -S -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; CHECK: LV: Adding cost of generating tail-fold mask for VF 1: 0
+; CHECK: LV: Adding cost of generating tail-fold mask for VF 2: 2
+; CHECK: LV: Adding cost of generating tail-fold mask for VF 4: 4
+; CHECK: LV: Adding cost of generating tail-fold mask for VF 8: 8
+; CHECK: LV: Adding cost of generating tail-fold mask for VF vscale x 1: 2
+; CHECK: LV: Adding cost of generating tail-fold mask for VF vscale x 2: 4
+; CHECK: LV: Adding cost of generating tail-fold mask for VF vscale x 4: 8
+
+define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
+entry:
+ br label %while.body
+
+while.body: ; preds = %while.body, %entry
+ %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+ %gep = getelementptr i32, ptr %ptr, i64 %index
+ store i32 %val, ptr %gep
+ %index.next = add nsw i64 %index, 1
+ %cmp10 = icmp ult i64 %index.next, %n
+ br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 1ce4cb928e8085..f625d98a263d1a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -497,49 +497,21 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
;
; TF-FIXEDLEN-LABEL: @conditional_uniform_load(
; TF-FIXEDLEN-NEXT: entry:
-; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; TF-FIXEDLEN: vector.ph:
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]]
-; TF-FIXEDLEN: vector.body:
-; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 1025)
-; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], <i64 10, i64 10, i64 10, i64 10>
-; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison)
-; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP2]], [[TMP4]]
-; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer
-; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
-; TF-FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[TMP6]])
-; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; TF-FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; TF-FIXEDLEN: middle.block:
-; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; TF-FIXEDLEN: scalar.ph:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; TF-FIXEDLEN-NEXT: br label [[FOR_BODY:%.*]]
; TF-FIXEDLEN: for.body:
-; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
; TF-FIXEDLEN-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
; TF-FIXEDLEN-NEXT: br i1 [[CMP]], label [[DO_LOAD:%.*]], label [[LATCH]]
; TF-FIXEDLEN: do_load:
-; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
+; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
; TF-FIXEDLEN-NEXT: br label [[LATCH]]
; TF-FIXEDLEN: latch:
; TF-FIXEDLEN-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[V]], [[DO_LOAD]] ]
-; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; TF-FIXEDLEN-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; TF-FIXEDLEN: for.end:
; TF-FIXEDLEN-NEXT: ret void
;
@@ -708,7 +680,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; TF-FIXEDLEN: middle.block:
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; TF-FIXEDLEN: scalar.ph:
@@ -721,7 +693,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; TF-FIXEDLEN: for.end:
; TF-FIXEDLEN-NEXT: ret void
;
@@ -883,7 +855,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; TF-FIXEDLEN: middle.block:
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; TF-FIXEDLEN: scalar.ph:
@@ -896,7 +868,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TF-FIXEDLEN: for.end:
; TF-FIXEDLEN-NEXT: ret void
;
@@ -1114,7 +1086,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TF-FIXEDLEN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; TF-FIXEDLEN: middle.block:
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; TF-FIXEDLEN: scalar.ph:
@@ -1127,7 +1099,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; TF-FIXEDLEN: for.end:
; TF-FIXEDLEN-NEXT: ret void
;
@@ -1353,7 +1325,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; TF-FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; TF-FIXEDLEN: middle.block:
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; TF-FIXEDLEN: scalar.ph:
@@ -1371,7 +1343,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; TF-FIXEDLEN: for.end:
; TF-FIXEDLEN-NEXT: ret void
;
@@ -1539,7 +1511,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; TF-FIXEDLEN: middle.block:
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; TF-FIXEDLEN: scalar.ph:
@@ -1552,7 +1524,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; TF-FIXEDLEN: for.end:
; TF-FIXEDLEN-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
index 3f38abc75a5837..aed7cb6f517b7b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S -passes=loop-vectorize < %s | FileCheck %s
+; RUN: opt -S -passes=loop-vectorize -force-vector-width=4 < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -16,10 +16,10 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
; CHECK-NEXT: bb5:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 99, i64 98, i64 97, i64 96>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 99, i64 98, i64 97, i64 96>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_HEADER]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 99, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
@@ -38,1...
[truncated]
Gentle ping!
; TF-FIXEDLEN-NEXT: br label [[FOR_BODY:%.*]]
; TF-FIXEDLEN: for.body:
; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
Seems like this may have inadvertently broken an RVV functionality test? Though the scalable version still works...
I think it's because the cost model is now teaching the vectoriser to behave sensibly, i.e. when using tail-folding, if we only have fixed-width vectors available it's cheaper to use a scalar loop. That's presumably because for RISCV the compiler expands the active.lane.mask into a horrible sequence of instructions. I did think about trying to somehow hack the RUN lines or add loop attributes to force tail-folding for fixed-width vectors, but given there is still test coverage for the scalable case I figured I'd just leave it. Unless someone from RISCV has a preference - @topperc?
Rebase.
The tests have changed because the active lane mask cost is the same for VF=8 and VF=16. As a result the per scalar iteration cost is lower for VF=16.
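To spell out the arithmetic behind that (with purely illustrative symbols rather than real cost-model numbers): if the vector body scales roughly as $B \cdot \mathrm{VF}$ and generating the lane mask adds the same constant $C$ at both widths, the per-scalar-iteration estimate is

$$\frac{B \cdot \mathrm{VF} + C}{\mathrm{VF}} = B + \frac{C}{\mathrm{VF}},$$

so the fixed mask overhead is amortised over twice as many lanes at VF=16, tilting the comparison towards the wider VF.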
@@ -430,116 +430,41 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
;
; TF-SCALABLE-LABEL: @conditional_uniform_load(
; TF-SCALABLE-NEXT: entry:
; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
Both TF-SCALABLE and TF-FIXED now fail to vectorise because the additional lane mask cost has just pushed it over the cost of the scalar loop. At least for the scalable case this may indicate the cost model for get.active.lane.mask calls needs improving?
} else {
// This is just a stepvector, added to a splat of the current IV, followed
// by a vector comparison with a splat of the trip count. Since the
// stepvector is loop invariant it will be hoisted out so we can ignore it.
// This just leaves us with an add and an icmp.
VectorType *VecTy = VectorType::get(IndTy, VF);
MaskCost = TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
MaskCost += TTI.getCmpSelInstrCost(Instruction::ICmp, VecTy, RetTy,
ICmpInst::ICMP_ULE, CostKind, nullptr);
}
This is wrong for tail-folding with EVL; it needs a special case. For the data-with-evl tail-folding style the cost is the cost of a sub i64 plus the cost of an i32 @llvm.experimental.get.vector.length call.
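For illustration, a minimal sketch of what such a special case could look like inside the getTailFoldMaskCost hook this patch adds; the TailFoldingStyle::DataWithEVL check and the purely type-based intrinsic costing are assumptions made for the sketch, not code from this PR:

```cpp
// Hypothetical special case for EVL-based tail folding, following the comment
// above: cost = (scalar sub of the remaining trip count) +
//               (call to i32 @llvm.experimental.get.vector.length).
if (Style == TailFoldingStyle::DataWithEVL) {
  InstructionCost EVLCost =
      TTI.getArithmeticInstrCost(Instruction::Sub, IndTy, CostKind);
  IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                IntegerType::getInt32Ty(Context),
                                /*Tys=*/{IndTy});
  EVLCost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
  return EVLCost;
}
```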
Initial cost-modeling based on VPlan is close to getting added (#67934). It runs at the point where those decisions are materialized, so handling all cases correctly could be done in a more straightforward manner by simply implementing the costs of the EVL/ActiveLaneMask/header-mask compare.
If #67934 is likely to land in the next week or so, I'm happy to hold off and base it off #67934. I can take a look at it, but I may need some help understanding how to do that. :)
Does your patch automatically solve this problem @fhahn, i.e. because it walks through every VPInstruction generated in the plan and so essentially there is no extra work needed?
I've updated this patch for reference anyway with the code changes and I've added some EVL check lines to the RISCV test. The test changes will be needed regardless.
> If #67934 is likely to land in the next week or so, I'm happy to hold off and base it off #67934. I can take a look at it, but I may need some help understanding how to do that. :)

Not sure about this week, but hopefully soon as all pending patches have landed.

> Does your patch automatically solve this problem @fhahn, i.e. because it walks through every VPInstruction generated in the plan and so essentially there is no extra work needed?

It won't solve it automatically; to solve it, VPInstruction::computeCost needs to be implemented for ActiveLaneMask, GetVectorLength and also VPWidenCanonicalIVRecipe::computeCost. Doing it that way would ensure we accurately cost what is actually code-gen'd (and avoid the issue of not knowing whether a splat + add of a step-vector will be needed, as per my comment above).
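To make that more concrete, here is a rough, hypothetical sketch of how the two header-mask opcodes could be priced directly from VPlan. Only the TTI hooks, intrinsic IDs and VPInstruction opcode names are real; the helper's name and parameter list are assumptions for illustration, since the exact computeCost signature was still in flux at the time:

```cpp
// Hypothetical helper: price the VPlan header-mask opcodes for a given VF.
static InstructionCost getHeaderMaskOpcodeCost(unsigned VPOpcode,
                                               ElementCount VF, Type *IndTy,
                                               const TargetTransformInfo &TTI) {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  LLVMContext &Ctx = IndTy->getContext();
  switch (VPOpcode) {
  case VPInstruction::ActiveLaneMask: {
    // <VF x i1> @llvm.get.active.lane.mask(iv, trip-count)
    auto *MaskTy = VectorType::get(Type::getInt1Ty(Ctx), VF);
    IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, MaskTy,
                                  /*Tys=*/{IndTy, IndTy});
    return TTI.getIntrinsicInstrCost(Attrs, CostKind);
  }
  case VPInstruction::ExplicitVectorLength: {
    // i32 @llvm.experimental.get.vector.length(remaining-count, VF, scalable)
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                  Type::getInt32Ty(Ctx), /*Tys=*/{IndTy});
    return TTI.getIntrinsicInstrCost(Attrs, CostKind);
  }
  default:
    return 0; // Other opcodes are outside the scope of this sketch.
  }
}
```

In the real VPlan-based model this logic would live in VPInstruction::computeCost (with a similar override for VPWidenCanonicalIVRecipe), so that whatever is actually generated is what gets costed.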
// stepvector is loop invariant it will be hoisted out so we can ignore it.
// This just leaves us with an add and an icmp.
VectorType *VecTy = VectorType::get(IndTy, VF);
MaskCost = TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
Is this entirely accurate? We might need to also broadcast the canonical IV, or there may be no increment needed if we can re-use an existing widened IV. Unfortunately I am not sure if we can query this accurately at this point.
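If one wanted to be conservative here, a hypothetical tweak (not part of this patch) would be to also charge for the splat as a broadcast shuffle next to the add and compare, accepting that it may over-count when an existing widened IV can be reused:

```cpp
// Hypothetical, conservative adjustment: also charge for broadcasting the
// scalar IV into a vector splat. As noted above, whether a broadcast (or even
// the add) is really emitted depends on IV widening decisions that are not
// known at this point, so this may over-estimate the mask cost.
MaskCost += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
                               /*Mask=*/{}, CostKind);
```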
Hi @john-brawn-arm, whilst reviewing your PR #109289 I discovered I still have this patch that I created to add the cost of tail-folding to the loop. I think it may also fix the same issue you're looking at, albeit in a more accidental way. At a reviewer's request I held off doing further work on this PR until we moved over to the vplan cost model so that I could implement it differently; however, it looks like we haven't moved over yet. @fhahn do you have any idea when we'll remove the assert that legacy cost == vplan cost? Anyway, for the purpose of testing your issue @john-brawn-arm you can still try out this patch - I've simply commented out the assert that the legacy cost == vplan cost for now.
Looking at store_const_fixed_trip_count in the tail-folding.ll test added by PR #109289, it has estimated cost
This PR gives estimated cost
Adding the cost of generating the tail-fold mask doesn't counteract the lack of the cost of each scalarized store requiring a branch, so VF 8 is still selected.
Currently it's not possible to improve the cost model for tail-folded loops because as soon as you add a VPInstruction::computeCost function that adds the costs of instructions such as VPInstruction::ActiveLaneMask and VPInstruction::ExplicitVectorLength the assert in LoopVectorizationPlanner::computeBestVF fails for some tests. This is because the VF chosen by the legacy cost model doesn't match the vplan cost model. See PR llvm#90191. This assert is currently inhibiting attempts to improve the cost model. I would like to remove the assert since we've been using the vplan cost model for 2 months now and that feels long enough to me. However, in order to do that we have to fix up a whole bunch of tests that rely upon the legacy cost model output. I've tried my best to update these tests to use vplan output instead. There are still a whole bunch of vectoriser tests in Analysis/CostModel/X86 that depend upon the legacy cost model, and also I feel they shouldn't really live there either. These can be fixed up in a separate patch!
At the moment if we decide to enable tail-folding we do not include the cost of generating the mask per VF. This can mean we make some poor choices of VF, which is definitely true for SVE-enabled AArch64 targets where mask generation for fixed-width vectors is more expensive than for scalable vectors. I've added a VPInstruction::computeCost function to return the costs of the ActiveLaneMask and ExplicitVectorLength operations. Unfortunately, in order to prevent asserts firing I've also had to duplicate the same code in the legacy cost model to make sure the chosen VFs match up. I've wrapped this up in a ifndef NDEBUG for now. New tests added: Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
When adding the cost of the lane mask to the loop, in some cases we just decide it's not worth vectorising. Whilst this is probably the correct decision from a performance perspective, it does mean I had to add attributes so that we're still testing the code paths intended by the original tests.
(#113697) Currently it's very difficult to improve the cost model for tail-folded loops because as soon as you add a VPInstruction::computeCost function that adds the costs of instructions such as VPInstruction::ActiveLaneMask and VPInstruction::ExplicitVectorLength the assert in LoopVectorizationPlanner::computeBestVF fails for some tests. This is because the VF chosen by the legacy cost model doesn't match the vplan cost model. See PR #90191. This assert is currently making it difficult to improve the cost model. Hopefully we will be in a position to remove the assert soon, however in order to do that we have to fix up a whole bunch of tests that rely upon the legacy cost model output. I've tried my best to update these tests to use vplan output instead. There is still work needed for the VF=1 case because the vplan cost model is not printed out in this case. I've not attempted to fix those in this patch.
At the moment if we decide to enable tail-folding we do not include the cost of generating the mask per VF. This can mean we make some poor choices of VF, which is definitely true for SVE-enabled AArch64 targets where mask generation for fixed-width vectors is more expensive than for scalable vectors.
New tests added:
Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
Transforms/LoopVectorize/RISCV/tail-folding-cost.ll