
Commit 98487f0

ElvisWang123 authored and google-yfyang committed
[VPlan] Implement VPlan-based cost model for VPReduction, VPExtendedReduction and VPMulAccumulateReduction. (llvm#113903)
This patch implements the VPlan-based cost model for VPReduction, VPExtendedReduction and VPMulAccumulateReduction. With this patch, the reduction cost is calculated by the VPlan-based cost model, so the reduction costs in `precomputeCosts()` are removed.

Ref: original instruction-based implementation: https://reviews.llvm.org/D93476
1 parent 475ddab commit 98487f0
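In short, each in-loop reduction recipe now prices itself with a single TargetTransformInfo query instead of having its cost pre-computed by the legacy model. Below is a minimal sketch of the three queries involved; the free functions and their names (costPlainReduction and friends) are illustrative only, not code from the patch (the real calls sit in the recipes' computeCost() overrides in VPlanRecipes.cpp further down), and the opcode, types and cost kind are assumed to have been derived from the recipe already. The integer form is shown; for floating-point reductions the patch also threads the recipe's FastMathFlags into getArithmeticReductionCost.

#include "llvm/Analysis/TargetTransformInfo.h"
#include <optional>

using namespace llvm;

// Illustrative sketch only. VPReductionRecipe: an ordinary in-loop reduction
// over VectorTy, priced with one arithmetic-reduction query.
InstructionCost costPlainReduction(const TargetTransformInfo &TTI,
                                   unsigned Opcode, VectorType *VectorTy,
                                   TargetTransformInfo::TargetCostKind CK) {
  return TTI.getArithmeticReductionCost(Opcode, VectorTy, std::nullopt, CK);
}

// VPExtendedReductionRecipe: reduce(ext(Src)) priced as one extended-reduction
// query instead of a separate extend cost plus reduction cost.
InstructionCost costExtendedReduction(const TargetTransformInfo &TTI,
                                      unsigned Opcode, bool IsZExt, Type *RedTy,
                                      VectorType *SrcVecTy,
                                      TargetTransformInfo::TargetCostKind CK) {
  return TTI.getExtendedReductionCost(Opcode, IsZExt, RedTy, SrcVecTy,
                                      std::nullopt, CK);
}

// VPMulAccumulateReductionRecipe: reduce.add(mul(ext(A), ext(B))) priced as a
// single multiply-accumulate reduction query.
InstructionCost costMulAccReduction(const TargetTransformInfo &TTI,
                                    bool IsZExt, Type *RedTy,
                                    VectorType *SrcVecTy,
                                    TargetTransformInfo::TargetCostKind CK) {
  return TTI.getMulAccReductionCost(IsZExt, RedTy, SrcVecTy, CK);
}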

File tree

7 files changed: +102 −120 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 0 additions & 56 deletions
@@ -7192,62 +7192,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     }
   }
 
-  // The legacy cost model has special logic to compute the cost of in-loop
-  // reductions, which may be smaller than the sum of all instructions involved
-  // in the reduction.
-  // TODO: Switch to costing based on VPlan once the logic has been ported.
-  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
-    if (ForceTargetInstructionCost.getNumOccurrences())
-      continue;
-
-    if (!CM.isInLoopReduction(RedPhi))
-      continue;
-
-    const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
-    SetVector<Instruction *> ChainOpsAndOperands(llvm::from_range, ChainOps);
-    auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
-      return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
-    };
-    // Also include the operands of instructions in the chain, as the cost-model
-    // may mark extends as free.
-    //
-    // For ARM, some of the instruction can folded into the reducion
-    // instruction. So we need to mark all folded instructions free.
-    // For example: We can fold reduce(mul(ext(A), ext(B))) into one
-    // instruction.
-    for (auto *ChainOp : ChainOps) {
-      for (Value *Op : ChainOp->operands()) {
-        if (auto *I = dyn_cast<Instruction>(Op)) {
-          ChainOpsAndOperands.insert(I);
-          if (I->getOpcode() == Instruction::Mul) {
-            auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
-            auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
-            if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
-                Ext0->getOpcode() == Ext1->getOpcode()) {
-              ChainOpsAndOperands.insert(Ext0);
-              ChainOpsAndOperands.insert(Ext1);
-            }
-          }
-        }
-      }
-    }
-
-    // Pre-compute the cost for I, if it has a reduction pattern cost.
-    for (Instruction *I : ChainOpsAndOperands) {
-      auto ReductionCost =
-          CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
-      if (!ReductionCost)
-        continue;
-
-      assert(!CostCtx.SkipCostComputation.contains(I) &&
-             "reduction op visited multiple times");
-      CostCtx.SkipCostComputation.insert(I);
-      LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
-                        << ":\n in-loop reduction " << *I << "\n");
-      Cost += *ReductionCost;
-    }
-  }
-
   // Pre-compute the costs for branches except for the backedge, as the number
   // of replicate regions in a VPlan may not directly match the number of
   // branches, which would lead to different decisions.
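
The block removed above existed because, on targets like ARM MVE, a chain such as reduce(mul(ext(A), ext(B))) can fold into a single instruction, so summing the per-instruction costs over-counts. The hypothetical comparison below illustrates the difference between pricing the pattern piecewise and pricing it through the single fused TTI hook that VPMulAccumulateReductionRecipe::computeCost now uses; the function names and the piecewise decomposition are illustrative assumptions, not code from the patch.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include <optional>

using namespace llvm;

// Piecewise pricing: two extends, one widening multiply and one add-reduction,
// each costed on its own. This is the "sum of all instructions involved".
InstructionCost sumOfPartsCost(const TargetTransformInfo &TTI, bool IsZExt,
                               VectorType *SrcVecTy, VectorType *WideVecTy,
                               TargetTransformInfo::TargetCostKind CK) {
  unsigned ExtOpc = IsZExt ? Instruction::ZExt : Instruction::SExt;
  InstructionCost ExtCost = TTI.getCastInstrCost(
      ExtOpc, WideVecTy, SrcVecTy, TargetTransformInfo::CastContextHint::None,
      CK);
  InstructionCost Cost = ExtCost + ExtCost; // one extend per multiplicand
  Cost += TTI.getArithmeticInstrCost(Instruction::Mul, WideVecTy, CK);
  Cost += TTI.getArithmeticReductionCost(Instruction::Add, WideVecTy,
                                         std::nullopt, CK);
  return Cost;
}

// Fused pricing: one query for the whole pattern, which a target such as ARM
// MVE may report as a single multiply-accumulate reduction instruction.
InstructionCost fusedMulAccCost(const TargetTransformInfo &TTI, bool IsZExt,
                                Type *RedTy, VectorType *SrcVecTy,
                                TargetTransformInfo::TargetCostKind CK) {
  return TTI.getMulAccReductionCost(IsZExt, RedTy, SrcVecTy, CK);
}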

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 8 additions & 0 deletions
@@ -2645,6 +2645,10 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
                      "VPExtendedRecipe + VPReductionRecipe before execution.");
   };
 
+  /// Return the cost of VPExtendedReductionRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2744,6 +2748,10 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
                      "VPWidenRecipe + VPReductionRecipe before execution");
   }
 
+  /// Return the cost of VPMulAccumulateReductionRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 44 additions & 14 deletions
@@ -761,19 +761,24 @@ Value *VPInstruction::generate(VPTransformState &State) {
 InstructionCost VPInstruction::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) const {
   if (Instruction::isBinaryOp(getOpcode())) {
+    Type *ResTy = Ctx.Types.inferScalarType(this);
+    if (!vputils::onlyFirstLaneUsed(this))
+      ResTy = toVectorTy(ResTy, VF);
+
     if (!getUnderlyingValue()) {
-      // TODO: Compute cost for VPInstructions without underlying values once
-      // the legacy cost model has been retired.
-      return 0;
+      switch (getOpcode()) {
+      case Instruction::FMul:
+        return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
+      default:
+        // TODO: Compute cost for VPInstructions without underlying values once
+        // the legacy cost model has been retired.
+        return 0;
+      }
     }
 
     assert(!doesGeneratePerAllLanes() &&
            "Should only generate a vector value or single scalar, not scalars "
            "for all lanes.");
-    Type *ResTy = Ctx.Types.inferScalarType(this);
-    if (!vputils::onlyFirstLaneUsed(this))
-      ResTy = toVectorTy(ResTy, VF);
-
     return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
   }
 
@@ -2520,24 +2525,49 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
   auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
   unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
   FastMathFlags FMFs = getFastMathFlags();
+  std::optional<FastMathFlags> OptionalFMF =
+      ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
 
   // TODO: Support any-of reductions.
   assert(
       (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "Any-of reduction not implemented in VPlan-based cost model currently.");
 
-  // Cost = Reduction cost + BinOp cost
-  InstructionCost Cost =
-      Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
+  // Note that TTI should model the cost of moving result to the scalar register
+  // and the BinOp cost in the getMinMaxReductionCost().
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
-    return Cost +
-           Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
+    return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
   }
 
-  return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
-                                                   Ctx.CostKind);
+  // Note that TTI should model the cost of moving result to the scalar register
+  // and the BinOp cost in the getArithmeticReductionCost().
+  return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
+                                            Ctx.CostKind);
+}
+
+InstructionCost
+VPExtendedReductionRecipe::computeCost(ElementCount VF,
+                                       VPCostContext &Ctx) const {
+  unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
+  Type *RedTy = Ctx.Types.inferScalarType(this);
+  auto *SrcVecTy =
+      cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
+  assert(RedTy->isIntegerTy() &&
+         "ExtendedReduction only support integer type currently.");
+  return Ctx.TTI.getExtendedReductionCost(Opcode, isZExt(), RedTy, SrcVecTy,
+                                          std::nullopt, Ctx.CostKind);
+}
+
+InstructionCost
+VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
+                                            VPCostContext &Ctx) const {
+  Type *RedTy = Ctx.Types.inferScalarType(this);
+  auto *SrcVecTy =
+      cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
+  return Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy,
+                                        Ctx.CostKind);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll

Lines changed: 6 additions & 6 deletions
@@ -12,14 +12,14 @@ target triple="aarch64-unknown-linux-gnu"
 
 ; CHECK-VSCALE2-LABEL: LV: Checking a loop in 'fadd_strict32'
 ; CHECK-VSCALE2: Cost of 4 for VF vscale x 2:
-; CHECK-VSCALE2: in-loop reduction %add = fadd float %0, %sum.07
+; CHECK-VSCALE2: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 ; CHECK-VSCALE2: Cost of 8 for VF vscale x 4:
-; CHECK-VSCALE2: in-loop reduction %add = fadd float %0, %sum.07
+; CHECK-VSCALE2: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 ; CHECK-VSCALE1-LABEL: LV: Checking a loop in 'fadd_strict32'
 ; CHECK-VSCALE1: Cost of 2 for VF vscale x 2:
-; CHECK-VSCALE1: in-loop reduction %add = fadd float %0, %sum.07
+; CHECK-VSCALE1: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 ; CHECK-VSCALE1: Cost of 4 for VF vscale x 4:
-; CHECK-VSCALE1: in-loop reduction %add = fadd float %0, %sum.07
+; CHECK-VSCALE1: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 
 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
@@ -42,10 +42,10 @@ for.end:
 
 ; CHECK-VSCALE2-LABEL: LV: Checking a loop in 'fadd_strict64'
 ; CHECK-VSCALE2: Cost of 4 for VF vscale x 2:
-; CHECK-VSCALE2: in-loop reduction %add = fadd double %0, %sum.07
+; CHECK-VSCALE2: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 ; CHECK-VSCALE1-LABEL: LV: Checking a loop in 'fadd_strict64'
 ; CHECK-VSCALE1: Cost of 2 for VF vscale x 2:
-; CHECK-VSCALE1: in-loop reduction %add = fadd double %0, %sum.07
+; CHECK-VSCALE1: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 
 define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:

llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll

Lines changed: 21 additions & 21 deletions
@@ -800,11 +800,11 @@ define i32 @mla_i32_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i3
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Y1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
@@ -961,11 +961,11 @@ define signext i16 @mla_i16_i16(ptr nocapture readonly %x, ptr nocapture readonl
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[Y1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP7]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP2]], <8 x i16> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]])
 ; CHECK-NEXT: [[TMP5]] = add i16 [[TMP4]], [[VEC_PHI]]
@@ -1067,11 +1067,11 @@ define zeroext i8 @mla_i8_i8(ptr nocapture readonly %x, ptr nocapture readonly %
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Y1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP2]], <16 x i8> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP3]])
 ; CHECK-NEXT: [[TMP5]] = add i8 [[TMP4]], [[VEC_PHI]]
@@ -1181,11 +1181,11 @@ define i64 @red_mla_ext_s16_u16_s64(ptr noalias nocapture readonly %A, ptr noali
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[B1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP11]], align 2
 ; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[WIDE_LOAD1]] to <4 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[TMP4]] to <4 x i64>
@@ -1204,10 +1204,10 @@ define i64 @red_mla_ext_s16_u16_s64(ptr noalias nocapture readonly %A, ptr noali
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[S_010:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[I_011]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[I_011]]
 ; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[I_011]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[B1]], i32 [[I_011]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
 ; CHECK-NEXT: [[CONV2:%.*]] = zext i16 [[TMP10]] to i32
 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
@@ -1266,12 +1266,12 @@ define i32 @red_mla_u8_s8_u32(ptr noalias nocapture readonly %A, ptr noalias noc
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP0]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32>
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
-; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP9]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD2]] to <4 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
@@ -1408,8 +1408,8 @@ define i32 @mla_i8_i32_multiuse(ptr nocapture readonly %x, ptr nocapture readonl
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]]
 ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
 ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
