[AArch64] Disallow vscale x 1 partial reductions #125252
Conversation
We don't want to allow partial reductions resulting in a vscale x 1 type as we can't lower it in the backend.
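For illustration only (not part of the patch, with example values assumed rather than taken from the code): with i8 inputs, an i64 accumulator, and a scalable VF of vscale x 8, the partial reduction's accumulator ends up with a known minimum of one lane, i.e. a <vscale x 1 x i64>, which is exactly the case being rejected. A rough standalone C++ sketch of the arithmetic the cost hook performs:

#include <cassert>

int main() {
  // Assumed example values: i8 inputs, i64 accumulator, VF = vscale x 8.
  unsigned InputBits = 8;
  unsigned AccumBits = 64;
  unsigned VFMinValue = 8;

  // Each accumulator lane absorbs AccumBits / InputBits input lanes, so the
  // accumulator vector's known-minimum lane count is VFMinValue / Scale.
  unsigned Scale = AccumBits / InputBits;      // 8
  unsigned AccumMinLanes = VFMinValue / Scale; // 1, i.e. <vscale x 1 x i64>

  // This is the vscale x 1 accumulator type the AArch64 backend cannot lower.
  assert(AccumMinLanes == 1);
  return 0;
}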
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64

Author: Sam Tebbs (SamTebbs33)

Changes

We don't want to allow partial reductions resulting in a vscale x 1 type as we can't lower it in the backend.

Patch is 23.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125252.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cd093317275ee9a..69ea45fcb252913 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4687,13 +4687,24 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
EVT InputEVT = EVT::getEVT(InputTypeA);
EVT AccumEVT = EVT::getEVT(AccumType);
- if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
- return Invalid;
+ unsigned VFMinValue = VF.getKnownMinValue();
+
+ if (VF.isScalable()) {
+ if (!ST->isSVEorStreamingSVEAvailable())
+ return Invalid;
+
+ // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
+ // since we can't lower that type.
+ unsigned Scale =
+ AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
+ if (VFMinValue / Scale == 1)
+ return Invalid;
+ }
if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd()))
return Invalid;
if (InputEVT == MVT::i8) {
- switch (VF.getKnownMinValue()) {
+ switch (VFMinValue) {
default:
return Invalid;
case 8:
@@ -4712,7 +4723,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
} else if (InputEVT == MVT::i16) {
// FIXME: Allow i32 accumulator but increase cost, as we would extend
// it to i64.
- if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64)
+ if (VFMinValue != 8 || AccumEVT != MVT::i64)
return Invalid;
} else
return Invalid;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index a0214ae88c2d6ed..9d0d30abce6c933 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -2434,7 +2434,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
@@ -2446,12 +2446,12 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP9]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i64> @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64(<vscale x 1 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[TMP14]])
+; CHECK-MAXBW-NEXT: [[TMP19]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP14]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP19]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -3106,89 +3106,188 @@ exit: ; preds = %for.cond.cleanup.loopexit, %ent
ret i64 %result
}
+define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
+; CHECK-INTERLEAVE1-LABEL: define dso_local i32 @not_dotp_vscale1(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.body.preheader:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP16]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add <vscale x 2 x i64> [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP18]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define dso_local i32 @not_dotp_vscale1(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-INTERLEAVED: for.body.preheader:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP24]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <vscale x 2 x i64> [[TMP26]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP28]], [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-MAXBW-LABEL: define dso_local i32 @not_dotp_vscale1(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-MAXBW: for.body.preheader:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = insertelement <vscale x 8 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP16]], [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP20]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ %cmp = icmp sgt i32 %n, 0
+ br i1 %cmp, label %for.body, label %exit
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %accum = phi i64 [ %add, %for.body ], [ %cost, %entry ]
+ %a.ptr = phi ptr [ %a.gep, %for.body ], [ %a, %entry ]
+ %b.ptr = phi ptr [ %b.gep, %for.body ], [ %b, %entry ]
+ %a.load = load i8, ptr %a.ptr, align 1
+ %a.ext = zext i8 %a.load to i64
+ %b.load = load i8, ptr %b.ptr, align 1
+ %b.ext = zext i8 %b.load to i64
+ %mul = mul nuw nsw i64 %b.ext, %a.ext
+ %add = add nsw i64 %mul, %accum
+ %a.gep = getelementptr inbounds nuw i8, ptr %a.ptr, i64 1
+ %b.gep = getelementptr inbounds nuw i8, ptr %b.ptr, i64 1
+ %iv.next = add nuw nsw i32 %iv, 1
+ %cmp.2 = icmp eq i32 %iv.next, %n
+ br i1 %cmp.2, label %exit, label %for.body
+
+exit: ; preds = %for.cond.cleanup.loopexit, %entry
+ %cost.result = phi i64 [ %cost, %entry ], [ %add, %for.body ]
+ %result = trunc i64 %cost.result to i32
+ ret i32 %result
+}
+
!7 = distinct !{!7, !8, !9, !10}
!8 = !{!"llvm.loop.mustprogress"}
!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!10 = !{!"llvm.loop.vectorize.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
-;.
-; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
-;.
-; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META...
[truncated]
  // since we can't lower that type.
  unsigned Scale =
      AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
  if (VFMinValue / Scale == 1)
Is this the same as:

if (VFMinValue == Scale)

?
It is! Fixed.
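(For context, the two forms presumably agree here because both the known-minimum VF and the accumulator/input element-width ratio are powers of two, so the integer division can only yield 1 when the two values coincide. A quick standalone check under that power-of-two assumption, using illustrative values:)

#include <cassert>

int main() {
  // Power-of-two VF minimums and width ratios assumed for illustration.
  const unsigned VFs[] = {1, 2, 4, 8, 16};
  const unsigned Scales[] = {1, 2, 4, 8};
  for (unsigned VFMinValue : VFs)
    for (unsigned Scale : Scales)
      // Both forms of the check agree for these power-of-two inputs.
      assert((VFMinValue / Scale == 1) == (VFMinValue == Scale));
  return 0;
}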
Hmm, I don't see any update here with the change?
Hmm, I did push it, and according to my local branch it's on my remote branch.
LGTM!
/cherry-pick c7995a6
Error: Command failed due to missing milestone.
@SamTebbs33 run the command again now :)
/cherry-pick c7995a6
/pull-request #125866
We don't want to allow partial reductions resulting in a vscale x 1 type as we can't lower it in the backend. (cherry picked from commit c7995a6)