[LoopVectorize] Add cost of generating tail-folding mask to the loop #130565
Conversation
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-vectorizers

Author: David Sherwood (david-arm)

Changes

At the moment if we decide to enable tail-folding we do not include the cost of generating the mask per VF. This can mean we make some poor choices of VF, which is definitely true for SVE-enabled AArch64 targets where mask generation for fixed-width vectors is more expensive than for scalable vectors.

I've added a VPInstruction::computeCost function to return the costs of the ActiveLaneMask and ExplicitVectorLength operations. Unfortunately, in order to prevent asserts firing I've also had to duplicate the same code in the legacy cost model to make sure the chosen VFs match up. I've wrapped this up in an ifndef NDEBUG for now.

New tests added:
Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
Transforms/LoopVectorize/RISCV/tail-folding-cost.ll

Patch is 80.44 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130565.diff

10 Files Affected:
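For context, here is a minimal hand-written sketch (not taken from the patch) of the vector loop shape that tail folding produces; the get.active.lane.mask call is the per-iteration mask whose generation cost this patch now accounts for:

declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32 immarg, <vscale x 4 x i1>)
declare i64 @llvm.vscale.i64()

define void @tail_folded_memset(ptr %dst, i64 %n) {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Mask enabling lanes in [index, n): this is the operation being costed.
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %n)
  %gep = getelementptr i32, ptr %dst, i64 %index
  call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> zeroinitializer, ptr %gep, i32 4, <vscale x 4 x i1> %mask)
  %vscale = call i64 @llvm.vscale.i64()
  %step = shl i64 %vscale, 2           ; VF = vscale x 4
  %index.next = add i64 %index, %step
  %done = icmp uge i64 %index.next, %n
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}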
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 74ddf906ff9fd..e4cb3231c3e31 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5610,6 +5610,34 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
Cost += BlockCost;
}
+#ifndef NDEBUG
+ // TODO: We're effectively having to duplicate the code from
+ // VPInstruction::computeCost, which is ugly. This isn't meant to be a fully
+ // accurate representation of the cost of tail-folding - it exists purely to
+ // stop asserts firing when the legacy cost doesn't match the VPlan cost.
+ if (!VF.isScalar() && foldTailByMasking()) {
+ TailFoldingStyle Style = getTailFoldingStyle();
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Type *I1Ty = IntegerType::getInt1Ty(Context);
+ Type *IndTy = Legal->getWidestInductionType();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ if (Style == TailFoldingStyle::DataWithEVL) {
+ Type *I32Ty = IntegerType::getInt32Ty(Context);
+ IntrinsicCostAttributes Attrs(
+ Intrinsic::experimental_get_vector_length, I32Ty,
+ {PoisonValue::get(IndTy), PoisonValue::get(I32Ty),
+ PoisonValue::get(I1Ty)});
+ Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
+ } else if (useActiveLaneMask(Style)) {
+ VectorType *RetTy = VectorType::get(I1Ty, VF);
+ IntrinsicCostAttributes Attrs(
+ Intrinsic::get_active_lane_mask, RetTy,
+ {PoisonValue::get(IndTy), PoisonValue::get(IndTy)});
+ Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
+ }
+ }
+#endif
+
return Cost;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5010bf029d140..297ffa46c8dba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -743,6 +743,21 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
return Ctx.TTI.getArithmeticReductionCost(
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
+ case VPInstruction::ActiveLaneMask: {
+ Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
+ Type *Arg1Ty = Ctx.Types.inferScalarType(getOperand(1));
+ Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
+ IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
+ {Arg0Ty, Arg1Ty});
+ return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+ }
+ case VPInstruction::ExplicitVectorLength: {
+ Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
+ Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
+ IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
+ I32Ty, {I32Ty, I1Ty});
+ return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+ }
default:
// TODO: Compute cost other VPInstructions once the legacy cost model has
// been retired.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index ce24d3cfded22..5ce5df6ad3444 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -613,112 +613,9 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; PRED-LABEL: define i32 @header_mask_and_invariant_compare(
; PRED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; PRED-NEXT: entry:
-; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; PRED: vector.memcheck:
-; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4
-; PRED-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2
-; PRED-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4
-; PRED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP2]]
-; PRED-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 4
-; PRED-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 4
-; PRED-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[C]], i64 4
-; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[E]], [[SCEVGEP1]]
-; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[D]], [[SCEVGEP]]
-; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; PRED-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[E]], [[SCEVGEP2]]
-; PRED-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
-; PRED-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
-; PRED-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
-; PRED-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[E]], [[SCEVGEP3]]
-; PRED-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
-; PRED-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
-; PRED-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
-; PRED-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[E]], [[SCEVGEP4]]
-; PRED-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]]
-; PRED-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; PRED-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
-; PRED-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[D]], [[SCEVGEP2]]
-; PRED-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
-; PRED-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
-; PRED-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]]
-; PRED-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[D]], [[SCEVGEP3]]
-; PRED-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
-; PRED-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
-; PRED-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
-; PRED-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[D]], [[SCEVGEP4]]
-; PRED-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]]
-; PRED-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
-; PRED-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
-; PRED-NEXT: br i1 [[CONFLICT_RDX27]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; PRED: vector.ph:
-; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT: [[TMP12:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], 4
-; PRED-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
-; PRED: vector.body:
-; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE37]] ]
-; PRED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0
-; PRED-NEXT: [[TMP7:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META4:![0-9]+]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP8:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META7:![0-9]+]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP9:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
-; PRED-NEXT: [[TMP10:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META9:![0-9]+]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP11:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP9]]
-; PRED-NEXT: [[TMP25:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
-; PRED-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[D]], i64 [[TMP15]]
-; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP25]], i32 0
-; PRED-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; PRED: pred.store.if:
-; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP9]], i32 0
-; PRED-NEXT: store i32 [[TMP27]], ptr [[E]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
-; PRED: pred.store.continue:
-; PRED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP25]], i32 1
-; PRED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
-; PRED: pred.store.if32:
-; PRED-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP9]], i32 1
-; PRED-NEXT: store i32 [[TMP17]], ptr [[E]], align 4, !alias.scope [[META11]], !noalias [[META13]]
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE33]]
-; PRED: pred.store.continue33:
-; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP25]], i32 2
-; PRED-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]]
-; PRED: pred.store.if34:
-; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP9]], i32 2
-; PRED-NEXT: store i32 [[TMP19]], ptr [[E]], align 4, !alias.scope [[META11]], !noalias [[META13]]
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE35]]
-; PRED: pred.store.continue35:
-; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP25]], i32 3
-; PRED-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]]
-; PRED: pred.store.if36:
-; PRED-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP9]], i32 3
-; PRED-NEXT: store i32 [[TMP21]], ptr [[E]], align 4, !alias.scope [[META11]], !noalias [[META13]]
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE37]]
-; PRED: pred.store.continue37:
-; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP23]], i32 4, <4 x i1> [[TMP25]]), !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
-; PRED-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP28]], i32 0
-; PRED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; PRED: middle.block:
-; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; PRED: scalar.ph:
-; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
; PRED: loop.header:
-; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
; PRED-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4
; PRED-NEXT: [[L_B:%.*]] = load i32, ptr [[B]], align 4
; PRED-NEXT: [[OR:%.*]] = or i32 [[L_B]], [[L_A]]
@@ -733,7 +630,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; PRED: loop.latch:
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; PRED-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV]], [[N]]
-; PRED-NEXT: br i1 [[C_1]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP18:![0-9]+]]
+; PRED-NEXT: br i1 [[C_1]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
; PRED: exit:
; PRED-NEXT: ret i32 0
;
@@ -848,7 +745,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
-; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
@@ -866,7 +763,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8
; PRED-NEXT: [[IV_CLAMP:%.*]] = and i64 [[IV]], 4294967294
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_CLAMP]], 512
-; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; PRED: exit:
; PRED-NEXT: ret void
;
@@ -1078,7 +975,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; PRED: pred.store.continue14:
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; PRED-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; PRED-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
@@ -1091,7 +988,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; PRED-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
-; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; PRED: exit:
; PRED-NEXT: ret void
;
@@ -1116,7 +1013,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-SAME: ptr noalias [[SRC_1:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[SRC_3:%.*]], ptr noalias [[SRC_4:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; DEFAULT: vector.scevcheck:
; DEFAULT-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
@@ -1358,7 +1255,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW7]]
; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP4]], [[TMP8]]
; PRED-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
-; PRED-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[ENTRY:%.*]]
; PRED: vector.ph:
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
@@ -1367,16 +1264,16 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[TMP0]], 8
; PRED-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
; PRED: vector.body:
-; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE27:%.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE27]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE27]] ]
-; PRED-NEXT: [[TMP18:%.*]] = load float, ptr [[SRC_1]], align 4
-; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[LOOP_LATCH]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_LATCH]] ]
+; PRED-NEXT: [[TMP86:%.*]] = load float, ptr [[SRC_1]], align 4
+; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP86]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
-; PRED-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_2]], align 4
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
+; PRED-NEXT: [[TMP87:%.*]] = load float, ptr [[SRC_2]], align 4
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP87]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
; PRED-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer
; PRED-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP20]])
@@ -1496,7 +1393,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: br label [[PRED_STORE_CONTINUE25]]
; PRED: pred.store.continue25:
; PRED-NEXT: [[TMP77:%.*]] = extractelement <8 x i1> [[TMP26]], i32 7
-; PRED-NEXT: br i1 [[TMP77]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27]]
+; PRED-NEXT: br i1 [[TMP77]], label [[PRED_STORE_IF26:%.*]], label [[LOOP_LATCH]]
; PRED: pred.store.if26:
; PRED-NEXT: [[TMP78:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
; PRED-NEXT: store float 0.000000e+00, ptr [[TMP78]], align 4
@@ -1508,32 +1405,32 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: store float 0.000000e+00, ptr [[TMP82]], align 4
; PRED-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
; PRED-NEXT: store float 0.000000e+00, ptr [[TMP83]], align 4
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE27]]
+; PRED-NEXT: br label [[LOOP_LATCH]]
; PRED: pred.store.continue27:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP17]])
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[IV]], i64 [[TMP17]])
; PRED-NEXT: [[TMP84:%.*]] = xor <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64...
[truncated]
In terms of performance, I ran SPEC2017 on neoverse-v1 and I observed a minor 0.8% regression in x264. This is because we now prefer to choose a larger VF in a hot loop which makes sense in general, but the loop has low trip counts so ends up being less efficient. I can personally live with this, since I believe this is still the right thing to do for the majority of loops. In the case of x264 what happens now is that the cost for VF=vscale x 8 ends up the same as for VF=vscale x 16, and the loop vectoriser favours larger VFs in a tie. We could add a flag to the vectoriser to prefer lower VFs if necessary.
@@ -1578,7 +1475,7 @@ if.then:
 loop.latch:
   %iv.next = add i64 %iv, 1
   %ec = icmp eq i64 %iv, %N
-  br i1 %ec, label %exit, label %loop.header
+  br i1 %ec, label %exit, label %loop.header, !llvm.loop !1
This is required to keep the test the same as before.
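For reference, pinning metadata of this kind generally looks like the block below (borrowed from another test later in this PR; the actual contents of !1 for this particular test are not shown in the thread):

!1 = distinct !{!1, !2, !3, !4}
!2 = !{!"llvm.loop.vectorize.width", i32 2}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}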
    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                  I32Ty, {I32Ty, I1Ty});
This should be {I32Ty, I32Ty, I1Ty} to match the intrinsic definition?
Good spot! Fixed, thanks.
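For reference, this intrinsic takes three parameters (declaration as I understand the current upstream definition; shown for illustration), which is why the cost query needs all of {IndTy, I32Ty, I1Ty}:

declare i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 immarg %vf, i1 immarg %scalable)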
      IntrinsicCostAttributes Attrs(
          Intrinsic::experimental_get_vector_length, I32Ty,
          {PoisonValue::get(IndTy), PoisonValue::get(I32Ty),
           PoisonValue::get(I1Ty)});
      Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
    } else if (useActiveLaneMask(Style)) {
      VectorType *RetTy = VectorType::get(I1Ty, VF);
      IntrinsicCostAttributes Attrs(
          Intrinsic::get_active_lane_mask, RetTy,
          {PoisonValue::get(IndTy), PoisonValue::get(IndTy)});
Is it possible to use the "types only" variant here like below? I know this is DEBUG only but it seems wasteful to create a bunch of redundant poison values.
Yep, I've updated it!
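For illustration, a sketch of the resulting types-only form (it mirrors the updated legacy-cost-model code quoted later in this review): the IntrinsicCostAttributes constructor is given the parameter types directly, so no placeholder poison values need to be created.

  IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                {IndTy, IndTy});
  Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);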
; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
; TFFALLBACK-NEXT: [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8
; TFFALLBACK-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
Do we lose test coverage for widening/scalarizing exp here or is this also covered somewhere else?
Good point. I've added a new masked-call-scalarize.ll file that contains the same test with VF forced to 2.
@@ -78,11 +78,16 @@ loop:
   store i64 0, ptr %arrayidx13, align 8
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 3
-  br i1 %exitcond.not, label %exit, label %loop
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !1
nit: !llvm.loop !0
Done!
!1 = distinct !{!1, !2, !3, !4}
!2 = !{!"llvm.loop.vectorize.width", i32 2}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
nit:
-!1 = distinct !{!1, !2, !3, !4}
-!2 = !{!"llvm.loop.vectorize.width", i32 2}
-!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.vectorize.width", i32 2}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
Done!
@@ -32,35 +32,35 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 2, i1 true)
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[TMP0]], [[EVL_BASED_IV]]
It looks like there are no changes except for variable names in vectorize-force-tail-with-evl-uniform-store.ll. Would it be possible to update only the metadata without modifying the IR?
Done!
@@ -482,10 +482,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-FIXEDLEN: vector.ph:
 ; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0
 ; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]]
+; TF-FIXEDLEN-NEXT: br label [[FOR_BODY:%.*]]
Could we still use VECTOR_BODY instead of FOR_BODY?
I've noticed some really odd behaviour with update_test_checks.py recently where labels get completely incorrect names! I'll see if I can just manually fix this, although that does mean it's not autogenerated.
I regenerated the tests in a previous patch so hopefully these problems have gone away now. Good spot!
  case VPInstruction::ActiveLaneMask: {
    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
    Type *Arg1Ty = Ctx.Types.inferScalarType(getOperand(1));
    Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
IIUC, llvm.get.active.lane.mask only returns a VF x i1 type. Could we use getInt1Ty directly here?
Yep, you're right. Done!
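Presumably the simplified code looks something like the line below (a sketch; the exact committed form is not shown in this thread). The element type is fixed to i1 instead of being inferred from the recipe:

  Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);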
Sorry, left a comment on the wrong PR!!
Regenerates CHECK lines in tests that are affected by PR llvm#130565 to aid reviews.
Regenerates CHECK lines in tests that are affected by PR #130565 to aid reviews.
At the moment if we decide to enable tail-folding we do not include the cost of generating the mask per VF. This can mean we make some poor choices of VF, which is definitely true for SVE-enabled AArch64 targets where mask generation for fixed-width vectors is more expensive than for scalable vectors. I've added a VPInstruction::computeCost function to return the costs of the ActiveLaneMask and ExplicitVectorLength operations. Unfortunately, in order to prevent asserts firing I've also had to duplicate the same code in the legacy cost model to make sure the chosen VFs match up. I've wrapped this up in an ifndef NDEBUG for now.

New tests added:
Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
One observation but otherwise this looks good to me.
    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
    Type *Arg1Ty = Ctx.Types.inferScalarType(getOperand(1));
Given get.active.lane.mask requires both parameters to be the same type, do we need to calculate Arg1Ty?
VPTypeAnalysis should already assert that both types are the same, so would probably be good to remove here.
Done
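With the redundant inference removed, the case presumably reduces to something like this sketch (names follow the quoted code; not the exact committed lines):

  Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
  IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                {ArgTy, ArgTy});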
    TailFoldingStyle Style = getTailFoldingStyle();
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    Type *I1Ty = IntegerType::getInt1Ty(Context);
    Type *IndTy = Legal->getWidestInductionType();
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    if (Style == TailFoldingStyle::DataWithEVL) {
      Type *I32Ty = IntegerType::getInt32Ty(Context);
      IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                    I32Ty, {IndTy, I32Ty, I1Ty});
      Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
    } else if (useActiveLaneMask(Style)) {
      VectorType *RetTy = VectorType::get(I1Ty, VF);
      IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                    {IndTy, IndTy});
      Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
I am wondering if it would be cleaner to handle this in the caller of expectedCost, where we have the VPlans available. With that, we could just iterate over all recipes in the loop region and compute the costs for ActiveLaneMask/EVL using the VPlan-based cost model and add them to the cost returned by expectedCost? This might be more scalable for future use-cases.
Fair enough!
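Assembling the fragments quoted in the rest of this thread, the reworked approach is roughly the following sketch (names follow the quoted code; the final committed version may differ):

  // Add the VPlan-computed cost of tail-folding mask generation to the
  // legacy cost: walk the vector loop region and cost any
  // ActiveLaneMask/ExplicitVectorLength VPInstructions found there.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
                        CM, CM.CostKind);
  VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(VectorRegion->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      auto *VPI = dyn_cast<VPInstruction>(&R);
      if (!VPI)
        continue;
      if (VPI->getOpcode() == VPInstruction::ActiveLaneMask ||
          VPI->getOpcode() == VPInstruction::ExplicitVectorLength)
        // cost() dispatches to the new VPInstruction::computeCost.
        Cost += VPI->cost(VF, CostCtx);
    }
  }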
  // cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
                        CM, CM.CostKind);
  if (VPRegionBlock *VectorRegion = P->getVectorLoopRegion()) {
at this point, I think there should always be a vector loop region?
Done
    for (VPBlockBase *Block :
         vp_depth_first_shallow(VectorRegion->getEntry())) {
could be slightly simplified using blocksOnly:

-    for (VPBlockBase *Block :
-         vp_depth_first_shallow(VectorRegion->getEntry())) {
+    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(VectorRegion->getEntry())) {
Done
      if (!isa<VPBasicBlock>(Block))
        continue;
      for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) {
        if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
might be worth

-        if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
+        auto *VPI = dyn_cast<VPInstruction>(&R);
+        if (!VPI)
+          continue;
to reduce the already high nesting level slightly
Done
LGTM, thanks!
At the moment if we decide to enable tail-folding we do not include
the cost of generating the mask per VF. This can mean we make some
poor choices of VF, which is definitely true for SVE-enabled AArch64
targets where mask generation for fixed-width vectors is more
expensive than for scalable vectors.
I've added a VPInstruction::computeCost function to return the costs
of the ActiveLaneMask and ExplicitVectorLength operations.
Unfortunately, in order to prevent asserts firing I've also had to
duplicate the same code in the legacy cost model to make sure the
chosen VFs match up. I've wrapped this up in an ifndef NDEBUG for
now. The alternative would be to disable the assert completely when
tail-folding, which I imagine is just as bad.
New tests added:
Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
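For illustration, a rough sketch of the kind of test the new files contain (the RUN line and loop below are assumptions for this summary, not the actual test contents): on an SVE target with tail folding forced, FileCheck can verify that a predicated scalable-VF loop is still chosen once the mask cost is included.

; RUN: opt -passes=loop-vectorize -mtriple=aarch64 -mattr=+sve \
; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s

; CHECK: call <vscale x 4 x i1> @llvm.get.active.lane.mask
define void @simple_memset(i32 %val, ptr %ptr, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep = getelementptr i32, ptr %ptr, i64 %iv
  store i32 %val, ptr %gep, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}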