
Commit 38e76dc

[LoopVectorizer] Allow partial reductions to be made in predicated loops

Do the select on the input rather than on the output. This way the mask has the same number of lanes as the other operand in the select instruction.
1 parent a5cc897 commit 38e76dc
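
For context, the shape that previously blocked this: with tail folding, the reduction update was selected against the accumulator phi using the block mask, so the mask, the update and the phi all had to have the same number of lanes, which a scaled-down partial-reduction accumulator does not have. A minimal sketch of that pattern (value names are illustrative; the fixed-width types are taken from the NEON test below):

; Tail-folded dot-product reduction without partial reductions: the select
; needs the <16 x i1> mask and both <16 x i32> operands to have matching
; lane counts, so a <4 x i32> accumulator cannot be used here.
  %mul  = mul nsw <16 x i32> %a.ext, %b.ext
  %add  = add <16 x i32> %mul, %vec.phi
  %next = select <16 x i1> %mask, <16 x i32> %add, <16 x i32> %vec.phi

Selecting on the input instead keeps the mask and the selected value at the same (wide) lane count and feeds zero for inactive lanes, which is a no-op for the add-based partial reduction.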

5 files changed: +58 -43 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 6 additions & 10 deletions
@@ -8721,13 +8721,6 @@ std::optional<std::pair<PartialReductionChain, unsigned>>
 VPRecipeBuilder::getScaledReduction(PHINode *PHI,
                                     const RecurrenceDescriptor &Rdx,
                                     VFRange &Range) {
-  // TODO: Allow scaling reductions when predicating. The select at
-  // the end of the loop chooses between the phi value and most recent
-  // reduction result, both of which have different VFs to the active lane
-  // mask when scaling.
-  if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
-    return std::nullopt;
-
   auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
   if (!Update)
     return std::nullopt;
@@ -8875,7 +8868,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
   if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
     std::swap(BinOp, Phi);
 
-  return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
+  VPValue *Mask = getBlockInMask(Reduction->getParent());
+
+  return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, Mask,
                                       Reduction);
 }
 
@@ -9712,8 +9707,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
           PhiTy->isFloatingPointTy()
               ? std::make_optional(RdxDesc.getFastMathFlags())
               : std::nullopt;
-      NewExitingVPV =
-          Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
+      if (!isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe()))
+        NewExitingVPV =
+            Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
         return isa<VPInstruction>(&U) &&
                cast<VPInstruction>(&U)->getOpcode() ==
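
Because inactive lanes now contribute zero to the add-based partial reduction, the tail-folding select that adjustRecipesForReductions would otherwise create on the exiting value is redundant when that value is a partial reduction, so it is skipped and the middle block reduces the accumulator directly. A sketch of the resulting middle-block change, mirroring the NEON test below (value names are illustrative):

; before: reduce the <16 x i32> value selected between the update and the phi
  %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %select)
; after: reduce the narrow <4 x i32> partial-reduction accumulator
  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce)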

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 18 additions & 5 deletions
@@ -2442,28 +2442,41 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// A recipe for forming partial reductions. In the loop, an accumulator and
 /// vector operand are added together and passed to the next iteration as the
 /// next accumulator. After the loop body, the accumulator is reduced to a
-/// scalar value.
+/// scalar value. If the mask operand is not nullptr then it is applied to the
+/// vector operand on each iteration.
 class VPPartialReductionRecipe : public VPSingleDefRecipe {
   unsigned Opcode;
 
 public:
   VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
-                           VPValue *Op1)
-      : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
+                           VPValue *Op1, VPValue *Mask = nullptr)
+      : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Mask,
                                  ReductionInst) {}
   VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
+                           VPValue *Mask = nullptr,
                            Instruction *ReductionInst = nullptr)
      : VPSingleDefRecipe(VPDef::VPPartialReductionSC,
                          ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
        Opcode(Opcode) {
     assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
            "Unexpected operand order for partial reduction recipe");
+    if (Mask)
+      addOperand(Mask);
   }
   ~VPPartialReductionRecipe() override = default;
 
   VPPartialReductionRecipe *clone() override {
-    return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
-                                        getUnderlyingInstr());
+    return getNumOperands() == 3
+               ? new VPPartialReductionRecipe(Opcode, getOperand(0),
+                                              getOperand(1), getOperand(2),
+                                              getUnderlyingInstr())
+               : new VPPartialReductionRecipe(Opcode, getOperand(0),
+                                              getOperand(1), nullptr,
+                                              getUnderlyingInstr());
+  }
+
+  VPValue *getMask() const {
+    return getNumOperands() == 3 ? getOperand(2) : nullptr;
   }
 
   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 6 additions & 0 deletions
@@ -326,6 +326,12 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
 
   Type *RetTy = PhiVal->getType();
 
+  VPValue *Mask = getMask();
+  if (Mask) {
+    Value *MaskVal = State.get(Mask);
+    Value *Zero = ConstantInt::get(BinOpVal->getType(), 0);
+    BinOpVal = Builder.CreateSelect(MaskVal, BinOpVal, Zero);
+  }
   CallInst *V = Builder.CreateIntrinsic(
       RetTy, Intrinsic::experimental_vector_partial_reduce_add,
       {PhiVal, BinOpVal}, nullptr, "partial.reduce");
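
With a mask present, execute() now emits a select against zero on the wide bin-op value and passes the result to the partial-reduce intrinsic, so only the accumulator uses the narrow type. A sketch of the emitted IR for the scalable case (value names are illustrative; the types match the SVE test below):

  %masked = select <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i32> %mul, <vscale x 16 x i32> zeroinitializer
  %acc.next = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %masked)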

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll

Lines changed: 12 additions & 12 deletions
@@ -1596,7 +1596,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
 ; CHECK-INTERLEAVE1: vector.body:
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
 ; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -1905,14 +1905,14 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
 ; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
 ; CHECK-INTERLEAVE1-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
 ; CHECK-INTERLEAVE1-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVE1: scalar.ph:
 ; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -1951,7 +1951,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
 ; CHECK-INTERLEAVED: vector.body:
 ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
 ; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -2260,14 +2260,14 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
-; CHECK-INTERLEAVED-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
 ; CHECK-INTERLEAVED-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]])
+; CHECK-INTERLEAVED-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVED: scalar.ph:
 ; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -2306,7 +2306,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
 ; CHECK-MAXBW: vector.body:
 ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
 ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -2615,14 +2615,14 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
 ; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
 ; CHECK-MAXBW-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
 ; CHECK-MAXBW-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
-; CHECK-MAXBW-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]]
-; CHECK-MAXBW-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
 ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-MAXBW-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
 ; CHECK-MAXBW-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]])
+; CHECK-MAXBW-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW: scalar.ph:
 ; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll

Lines changed: 16 additions & 16 deletions
@@ -1975,41 +1975,41 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW: vector.ph:
 ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
 ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16
 ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
 ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW: vector.body:
 ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
 ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
-; CHECK-MAXBW-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer
+; CHECK-MAXBW-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]])
 ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
-; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[TMP21]], i32 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK-MAXBW: middle.block:
 ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
 ; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
