release/20.x: [AArch64] Disallow vscale x 1 partial reductions (#125252) #125866
Conversation
We don't want to allow partial reductions that result in a vscale x 1 type, since we can't lower that type in the backend.

(cherry picked from commit c7995a6)
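For context, a minimal sketch (assumed names, not lines taken from the patch) of the IR shape this change stops the vectoriser from choosing: with i8 inputs and an i64 accumulator, a VF of vscale x 8 scales the accumulator down by a factor of 64 / 8 = 8, leaving a single scalable element:

  ; assumed IR sketch; %acc and %mul stand in for the accumulator phi and the
  ; widened multiply from the loop body
  %partial = call <vscale x 1 x i64> @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64(<vscale x 1 x i64> %acc, <vscale x 8 x i64> %mul)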
@david-arm What do you think about merging this PR to the release branch?
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-backend-aarch64

Author: None (llvmbot)

Changes

Backport c7995a6

Requested by: @SamTebbs33

Patch is 23.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125866.diff

2 Files Affected:
- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
- llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index aae2fdaf5bec37b..4af3c482e65984b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4682,13 +4682,24 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
EVT InputEVT = EVT::getEVT(InputTypeA);
EVT AccumEVT = EVT::getEVT(AccumType);
- if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
- return Invalid;
+ unsigned VFMinValue = VF.getKnownMinValue();
+
+ if (VF.isScalable()) {
+ if (!ST->isSVEorStreamingSVEAvailable())
+ return Invalid;
+
+ // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
+ // since we can't lower that type.
+ unsigned Scale =
+ AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
+ if (VFMinValue == Scale)
+ return Invalid;
+ }
if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd()))
return Invalid;
if (InputEVT == MVT::i8) {
- switch (VF.getKnownMinValue()) {
+ switch (VFMinValue) {
default:
return Invalid;
case 8:
@@ -4707,7 +4718,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
} else if (InputEVT == MVT::i16) {
// FIXME: Allow i32 accumulator but increase cost, as we would extend
// it to i64.
- if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64)
+ if (VFMinValue != 8 || AccumEVT != MVT::i64)
return Invalid;
} else
return Invalid;
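To illustrate the guard above with concrete numbers (a worked example under assumed values, not part of the patch): for an i8 input extended to an i64 accumulator, Scale = 64 / 8 = 8, so a scalable VF whose known minimum value is 8 would shrink the accumulator to vscale x 1 and is now rejected, while a wider VF still scales down to a lowerable type:

  ; assumed IR illustration; value names are placeholders
  ; VF = vscale x 8: the accumulator becomes <vscale x 1 x i64>, now costed Invalid
  %bad  = call <vscale x 1 x i64> @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64(<vscale x 1 x i64> %acc1, <vscale x 8 x i64> %in1)
  ; VF = vscale x 16: the accumulator stays <vscale x 2 x i64>, which can be lowered
  %good = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> %acc2, <vscale x 16 x i64> %in2)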
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index a0214ae88c2d6ed..9d0d30abce6c933 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -2434,7 +2434,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
@@ -2446,12 +2446,12 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP9]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i64> @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64(<vscale x 1 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[TMP14]])
+; CHECK-MAXBW-NEXT: [[TMP19]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP14]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP19]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -3106,89 +3106,188 @@ exit: ; preds = %for.cond.cleanup.loopexit, %ent
ret i64 %result
}
+define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
+; CHECK-INTERLEAVE1-LABEL: define dso_local i32 @not_dotp_vscale1(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.body.preheader:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP16]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add <vscale x 2 x i64> [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP18]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define dso_local i32 @not_dotp_vscale1(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-INTERLEAVED: for.body.preheader:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP24]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <vscale x 2 x i64> [[TMP26]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP28]], [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-MAXBW-LABEL: define dso_local i32 @not_dotp_vscale1(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-MAXBW: for.body.preheader:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = insertelement <vscale x 8 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP16]], [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP20]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ %cmp = icmp sgt i32 %n, 0
+ br i1 %cmp, label %for.body, label %exit
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %accum = phi i64 [ %add, %for.body ], [ %cost, %entry ]
+ %a.ptr = phi ptr [ %a.gep, %for.body ], [ %a, %entry ]
+ %b.ptr = phi ptr [ %b.gep, %for.body ], [ %b, %entry ]
+ %a.load = load i8, ptr %a.ptr, align 1
+ %a.ext = zext i8 %a.load to i64
+ %b.load = load i8, ptr %b.ptr, align 1
+ %b.ext = zext i8 %b.load to i64
+ %mul = mul nuw nsw i64 %b.ext, %a.ext
+ %add = add nsw i64 %mul, %accum
+ %a.gep = getelementptr inbounds nuw i8, ptr %a.ptr, i64 1
+ %b.gep = getelementptr inbounds nuw i8, ptr %b.ptr, i64 1
+ %iv.next = add nuw nsw i32 %iv, 1
+ %cmp.2 = icmp eq i32 %iv.next, %n
+ br i1 %cmp.2, label %exit, label %for.body
+
+exit: ; preds = %for.cond.cleanup.loopexit, %entry
+ %cost.result = phi i64 [ %cost, %entry ], [ %add, %for.body ]
+ %result = trunc i64 %cost.result to i32
+ ret i32 %result
+}
+
!7 = distinct !{!7, !8, !9, !10}
!8 = !{!"llvm.loop.mustprogress"}
!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!10 = !{!"llvm.loop.vectorize.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
-;.
-; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
-;.
-; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}...
[truncated]
This patch simply makes the vectoriser more conservative so that we vectorise fewer problematic loops. It's an important fix and it seems low-risk to me.
@SamTebbs33 (or anyone else): if you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR.