[LV][EVL] Emit vp.merge intrinsic to enable out-loop reduction in EVL vectorization. #101641
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Mel Chen (Mel-Chen)

Changes

Following #90184, this patch introduces a new VPInstruction, MergeUntilPivot, which is lowered to the llvm.vp.merge intrinsic.

Also, this patch performs a transformation to convert

  select(header-mask, LHS, RHS)

into

  MergeUntilPivot(all-true, LHS, RHS, EVL)

to support out-loop reduction using tail folding with EVL.

Patch is 131.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101641.diff

10 Files Affected:
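At the IR level, the rewrite the diff below implements has roughly this shape (a hedged sketch; the value names %header.mask, %add, %vec.phi, %all.true, and %evl are illustrative, not taken from the patch — in the updated tests the all-true mask is a splatted true constant):

  ; Before: the out-loop reduction select is guarded by the header mask.
  %rdx = select <vscale x 4 x i1> %header.mask, <vscale x 4 x i32> %add, <vscale x 4 x i32> %vec.phi
  ; After: the mask is all-true and the EVL serves as the pivot, so lanes
  ; at positions >= %evl keep the previous value from %vec.phi.
  %rdx = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> %all.true, <vscale x 4 x i32> %add, <vscale x 4 x i32> %vec.phi, i32 %evl)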
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c9da5e5d38a6b..4a435d00bf2f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1262,6 +1262,11 @@ class VPInstruction : public VPRecipeWithIRFlags {
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
+ // Selects elements from two vectors (second and third operand) based on a
+ // condition vector (first operand) and a pivot index (fourth operand). The
+ // lanes whose positions are greater than or equal to the pivot are taken
+ // from the third operand.
+ MergeUntilPivot,
};
private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 9cd7712624bac..75b88dbd7ddf3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -50,6 +50,17 @@ template <typename Class> struct bind_ty {
}
};
+/// Match a specified VPValue.
+struct specificval_ty {
+ const VPValue *Val;
+
+ specificval_ty(const VPValue *V) : Val(V) {}
+
+ bool match(VPValue *VPV) { return VPV == Val; }
+};
+
+inline specificval_ty m_Specific(const VPValue *VPV) { return VPV; }
+
/// Match a specified integer value or vector of all elements of that
/// value. \p BitWidth optionally specifies the bitwidth the matched constant
/// must have. If it is 0, the matched constant can have any bitwidth.
@@ -197,6 +208,39 @@ using AllBinaryRecipe_match =
BinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative, VPWidenRecipe,
VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>;
+template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode,
+ typename... RecipeTys>
+struct TernaryRecipe_match {
+ Op0_t Op0;
+ Op1_t Op1;
+ Op2_t Op2;
+
+ TernaryRecipe_match(Op0_t Op0, Op1_t Op1, Op2_t Op2)
+ : Op0(Op0), Op1(Op1), Op2(Op2) {}
+
+ bool match(const VPValue *V) {
+ auto *DefR = V->getDefiningRecipe();
+ return DefR && match(DefR);
+ }
+
+ bool match(const VPSingleDefRecipe *R) {
+ return match(static_cast<const VPRecipeBase *>(R));
+ }
+
+ bool match(const VPRecipeBase *R) {
+ if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R))
+ return false;
+ assert(R->getNumOperands() == 3 &&
+ "recipe with matched opcode does not have 3 operands");
+ return Op0.match(R->getOperand(0)) && Op1.match(R->getOperand(1)) &&
+ Op2.match(R->getOperand(2));
+ }
+};
+
+template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode>
+using TernaryVPInstruction_match =
+ TernaryRecipe_match<Op0_t, Op1_t, Op2_t, Opcode, VPInstruction>;
+
template <unsigned Opcode, typename Op0_t>
inline UnaryVPInstruction_match<Op0_t, Opcode>
m_VPInstruction(const Op0_t &Op0) {
@@ -209,6 +253,12 @@ m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1) {
return BinaryVPInstruction_match<Op0_t, Op1_t, Opcode>(Op0, Op1);
}
+template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t>
+inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>
+m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+ return TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>(Op0, Op1, Op2);
+}
+
template <typename Op0_t>
inline UnaryVPInstruction_match<Op0_t, VPInstruction::Not>
m_Not(const Op0_t &Op0) {
@@ -304,6 +354,13 @@ m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1);
}
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Instruction::Select>
+m_Select(const Op0_t &Cond, const Op1_t &LHS, const Op2_t &RHS) {
+ return m_VPInstruction<Instruction::Select, Op0_t, Op1_t, Op2_t>(Cond, LHS,
+ RHS);
+}
+
struct VPCanonicalIVPHI_match {
bool match(const VPValue *V) {
auto *DefR = V->getDefiningRecipe();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d..3f5e1c774fa5e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -144,6 +144,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
+ case VPInstruction::MergeUntilPivot:
return false;
default:
return true;
@@ -673,7 +674,17 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
}
return NewPhi;
}
-
+ case VPInstruction::MergeUntilPivot: {
+ assert(Part == 0 && "No unrolling expected for predicated vectorization.");
+ Value *Cond = State.get(getOperand(0), Part);
+ Value *OnTrue = State.get(getOperand(1), Part);
+ Value *OnFalse = State.get(getOperand(2), Part);
+ Value *Pivot = State.get(getOperand(3), VPIteration(0, 0));
+ assert(Pivot->getType()->isIntegerTy() && "Pivot should be an integer.");
+ return Builder.CreateIntrinsic(Intrinsic::vp_merge, {OnTrue->getType()},
+ {Cond, OnTrue, OnFalse, Pivot}, nullptr,
+ Name);
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -764,6 +775,9 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::BranchOnCond:
case VPInstruction::ResumePhi:
return true;
+ case VPInstruction::MergeUntilPivot:
+ // Pivot must be an integer.
+ return Op == getOperand(3);
};
llvm_unreachable("switch should return");
}
@@ -782,6 +796,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnCond:
case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::MergeUntilPivot:
return true;
};
llvm_unreachable("switch should return");
@@ -848,6 +863,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::PtrAdd:
O << "ptradd";
break;
+ case VPInstruction::MergeUntilPivot:
+ O << "merge-until-pivot";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 045f6c356669f..06ee84ea2b5b6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1436,14 +1436,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
return isa<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
&Phi);
});
- // FIXME: Remove this once we can transform (select header_mask, true_value,
- // false_value) into vp.merge.
- bool ContainsOutloopReductions =
- any_of(Header->phis(), [&](VPRecipeBase &Phi) {
- auto *R = dyn_cast<VPReductionPHIRecipe>(&Phi);
- return R && !R->isInLoop();
- });
- if (ContainsWidenInductions || ContainsOutloopReductions)
+ if (ContainsWidenInductions)
return false;
auto *CanonicalIVPHI = Plan.getCanonicalIV();
@@ -1474,6 +1467,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
NextEVLIV->insertBefore(CanonicalIVIncrement);
EVLPhi->addOperand(NextEVLIV);
+ using namespace llvm::VPlanPatternMatch;
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
VPRecipeBase *NewRecipe = nullptr;
@@ -1496,6 +1490,16 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
} else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
NewRecipe = new VPReductionEVLRecipe(*RedR, *VPEVL,
GetNewMask(RedR->getCondOp()));
+ } else if (auto *VPInst = dyn_cast<VPInstruction>(CurRecipe)) {
+ VPValue *LHS, *RHS;
+ if (match(VPInst, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
+ m_VPValue(RHS)))) {
+ VPValue *Cond = Plan.getOrAddLiveIn(ConstantInt::getTrue(
+ CanonicalIVPHI->getScalarType()->getContext()));
+ NewRecipe =
+ new VPInstruction(VPInstruction::MergeUntilPivot,
+ {Cond, LHS, RHS, VPEVL}, VPInst->getDebugLoc());
+ }
}
if (NewRecipe) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 8824fa8a16b74..0435a5ee99c33 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -122,19 +122,53 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-OUTLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; IF-EVL-OUTLOOP: for.body.preheader:
+; IF-EVL-OUTLOOP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-OUTLOOP: vector.ph:
+; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1
+; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]]
+; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4
+; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: vector.body:
+; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 4, i1 true)
+; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP9]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sext <vscale x 4 x i16> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; IF-EVL-OUTLOOP-NEXT: [[TMP11]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP10]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP6]])
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-OUTLOOP: middle.block:
+; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL-OUTLOOP: scalar.ph:
+; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL-OUTLOOP: for.body:
-; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[I_08]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP15]] to i32
; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit:
-; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
; IF-EVL-OUTLOOP: for.cond.cleanup:
; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index 8bde5ba5f1519..cea40f749b729 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -25,20 +25,61 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-LABEL: define i32 @cond_add(
; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-OUTLOOP: vector.ph:
+; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: vector.body:
+; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT1:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV1]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true)
+; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV1]], 0
+; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
+; IF-EVL-OUTLOOP-NEXT: [[TMP20]] = add <vscale x 4 x i32> [[TMP19]], [[VEC_PHI]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP11]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT1]] = add i64 [[TMP22]], [[EVL_BASED_IV1]]
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N_VEC]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-OUTLOOP: middle.block:
+; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-OUTLOOP: scalar.ph:
+; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL-OUTLOOP: for.body:
-; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP27]], 3
; IF-EVL-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP27]], i32 0
; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
-; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1
-; IF-EVL-OUTLOOP-N...
[truncated]
Moving to draft since I found a bug in the ReductionPHI handling. :[
538305b Fixed the backedge of the ReductionPHI.
Ping |
@fhahn Changed to emit VPWidenIntrinsicRecipe. If you think this approach is better, please let me know; I will remove the unused legacy code for VPInstruction::MergeUntilPivot later.
Rebased, and updated commit title and log. Please take a look, thanks. |
if (CM.usePredicatedReductionSelect(
        PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
Would it be sufficient to adjust the reduction phi recipe when introducing EVL recipes instead?
It can be adjusted in the EVL transformation; see patch 8a3982f.
But I don't recommend this. Such an implementation is more complicated, especially since a future VPlan transformation may sink the non-predicated reduction select out of the vectorized loop.
Could you point out why you want to adjust it in the EVL transformation?
At the moment, everything EVL-related is applied during the transform that introduces EVL recipes; one potential issue is that we would assume EVL is used here, but the transform may not apply.
I don't have any strong preferences; doing it later indeed seems to require some extra work.
You've raised a good point.
Adjusting the reduction phi too early can indeed cause some issues. Fortunately, this issue is related to performance rather than correctness. We can proceed with this approach for now and address this performance issue in a later patch.
Sounds good to me.
LGTM, thanks!
…t model (#115903)

In #101641, support for out-of-loop reductions with EVL tail folding was added by transforming selects into vp_merges in transformRecipestoEVLRecipes. Whilst the select was previously free, the vp_merge isn't, and it incurs a cost on RISC-V with the VPlan cost model. This diverged from the legacy cost model and caused the "VPlan cost model and legacy cost model disagreed" assertion to trigger when building 502.gcc_r from SPEC CPU 2017. Neither the select nor the vp_merge recipe from the VPlan exists in the underlying instructions, so I thought it would make the most sense to fix this by adding the cost to the underlying phi instruction in getInstructionCost. It's worth noting that on RISC-V this vp_merge won't actually generate any instructions, because the mask is all true and it will be folded away; we should update the cost model at some point to reflect that.
Following #90184, this patch emits the vp.merge intrinsic, which sets the inactive lanes of a select operation to the RHS instead of undef. Currently, this is applied to out-loop reductions in EVL vectorization.
This patch performs a transformation to convert

  select(header-mask, LHS, RHS)

into

  vp.merge(all-true, LHS, RHS, EVL)

and always uses the predicated reduction select to set the incoming value of the reduction phi, to support out-loop reduction when using tail folding with EVL.
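For reference, a minimal self-contained sketch of the vp.merge semantics relied on here (the function and its values are hypothetical, using a fixed-width type for brevity): lane i of the result comes from the second operand when i < pivot and the mask bit is set, and from the third operand otherwise, so with an all-true mask the pivot alone decides the split.

  define <4 x i32> @merge_demo(<4 x i32> %on.true, <4 x i32> %on.false, i32 %pivot) {
    ; With an all-true mask, lanes [0, %pivot) come from %on.true and the rest
    ; from %on.false; e.g. %pivot = 2, %on.true = <10, 11, 12, 13> and
    ; %on.false = <0, 1, 2, 3> yield <10, 11, 2, 3>.
    %r = call <4 x i32> @llvm.vp.merge.v4i32(<4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %on.true, <4 x i32> %on.false, i32 %pivot)
    ret <4 x i32> %r
  }
  declare <4 x i32> @llvm.vp.merge.v4i32(<4 x i1>, <4 x i32>, <4 x i32>, i32)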
TODO: Postpone the adjustment of the predicated reduction select to VPlanTransform. The current adjustment might be too early, which could lead to a situation where the predicated reduction select is adjusted, but the EVL recipes cannot be successfully generated during VPlanTransform.