llvm · SamTebbs33 · Apr 16, 2025 · Apr 22, 2025 · Apr 22, 2025 · Apr 22, 2025
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -223,6 +223,8 @@ class TargetTransformInfo {
   /// Get the kind of extension that an instruction represents.
   LLVM_ABI static PartialReductionExtendKind
   getPartialReductionExtendKind(Instruction *I);
+  static PartialReductionExtendKind
+  getPartialReductionExtendKind(Instruction::CastOps ExtOpcode);
 
   /// Construct a TTI object using a type implementing the \c Concept
   /// API below.

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -995,13 +995,24 @@ InstructionCost TargetTransformInfo::getShuffleCost(
 
 TargetTransformInfo::PartialReductionExtendKind
 TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
-  if (isa<SExtInst>(I))
-    return PR_SignExtend;
-  if (isa<ZExtInst>(I))
-    return PR_ZeroExtend;
+  if (auto *Cast = dyn_cast<CastInst>(I))
+    return getPartialReductionExtendKind(Cast->getOpcode());
   return PR_None;
 }
 
+TargetTransformInfo::PartialReductionExtendKind
+TargetTransformInfo::getPartialReductionExtendKind(
+    Instruction::CastOps ExtOpcode) {
+  switch (ExtOpcode) {
+  case Instruction::CastOps::ZExt:
+    return PR_ZeroExtend;
+  case Instruction::CastOps::SExt:
+    return PR_SignExtend;
+  default:
+    llvm_unreachable("Unexpected cast opcode");
+  }
+}
+
 TTI::CastContextHint
 TargetTransformInfo::getCastContextHint(const Instruction *I) {
   if (!I)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8639,9 +8639,6 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
            "Expected an ADD or SUB operation for predicated partial "
            "reductions (because the neutral element in the mask is zero)!");
     Cond = getBlockInMask(Builder.getInsertBlock());
-    VPValue *Zero =
-        Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
-    BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
   }
   return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
                                       ScaleFactor, Reduction);

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2470,7 +2470,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
     return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
            R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
            R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
-           R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
+           R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC ||
+           R->getVPDefID() == VPRecipeBase::VPPartialReductionSC;
   }
 
   static inline bool classof(const VPUser *U) {
@@ -2559,6 +2560,9 @@ class VPPartialReductionRecipe : public VPReductionRecipe {
   /// Get the factor that the VF of this recipe's output should be scaled by.
   unsigned getVFScaleFactor() const { return VFScaleFactor; }
 
+  /// Get the binary op this reduction is applied to.
+  VPValue *getBinOp() const { return getOperand(1); }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2694,6 +2698,10 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
   /// The scalar type after extending.
   Type *ResultTy = nullptr;
 
+  /// The scaling factor, relative to the VF, that this recipe's output is
+  /// divided by
+  unsigned VFScaleFactor = 1;
+
   /// For cloning VPMulAccumulateReductionRecipe.
   VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc)
       : VPReductionRecipe(
@@ -2703,22 +2711,25 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
             WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
             MulAcc->getDebugLoc()),
         ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
-        ResultTy(MulAcc->getResultType()) {
+        ResultTy(MulAcc->getResultType()),
+        VFScaleFactor(MulAcc->getVFScaleFactor()) {
     transferFlags(*MulAcc);
     setUnderlyingValue(MulAcc->getUnderlyingValue());
   }
 
 public:
   VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
                                  VPWidenCastRecipe *Ext0,
-                                 VPWidenCastRecipe *Ext1, Type *ResultTy)
+                                 VPWidenCastRecipe *Ext1, Type *ResultTy,
+                                 unsigned ScaleFactor = 1)
       : VPReductionRecipe(
             VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
             {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},
             R->getCondOp(), R->isOrdered(),
             WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
             R->getDebugLoc()),
-        ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) {
+        ExtOp(Ext0->getOpcode()), ResultTy(ResultTy),
+        VFScaleFactor(ScaleFactor) {
     assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
                Instruction::Add &&
            "The reduction instruction in MulAccumulateteReductionRecipe must "
@@ -2791,6 +2802,10 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
 
   /// Return true if the operand extends have the non-negative flag.
   bool isNonNeg() const { return IsNonNeg; }
+
+  /// Return the scaling factor that the VF is divided by to form the recipe's
+  /// output
+  unsigned getVFScaleFactor() const { return VFScaleFactor; }
 };
 
 /// VPReplicateRecipe replicates a given instruction producing multiple scalar

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -159,6 +159,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
   case VPBlendSC:
+  case VPPartialReductionSC:
   case VPReductionEVLSC:
   case VPReductionSC:
   case VPExtendedReductionSC:
@@ -295,14 +296,9 @@ InstructionCost
 VPPartialReductionRecipe::computeCost(ElementCount VF,
                                       VPCostContext &Ctx) const {
   std::optional<unsigned> Opcode = std::nullopt;
-  VPValue *BinOp = getOperand(1);
+  VPValue *BinOp = getBinOp();
 
-  // If the partial reduction is predicated, a select will be operand 0 rather
-  // than the binary op
   using namespace llvm::VPlanPatternMatch;
-  if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(), m_VPValue())))
-    BinOp = BinOp->getDefiningRecipe()->getOperand(1);
-
   // If BinOp is a negation, use the side effect of match to assign the actual
   // binary operation to BinOp
   match(BinOp, m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(BinOp)));
@@ -345,12 +341,18 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
   assert(getOpcode() == Instruction::Add &&
          "Unhandled partial reduction opcode");
 
-  Value *BinOpVal = State.get(getOperand(1));
-  Value *PhiVal = State.get(getOperand(0));
+  Value *BinOpVal = State.get(getBinOp());
+  Value *PhiVal = State.get(getChainOp());
   assert(PhiVal && BinOpVal && "Phi and Mul must be set");
 
   Type *RetTy = PhiVal->getType();
 
+  /// Mask the bin op output.
+  if (VPValue *Cond = getCondOp()) {
+    Value *Zero = ConstantInt::get(BinOpVal->getType(), 0);
+    BinOpVal = Builder.CreateSelect(State.get(Cond), BinOpVal, Zero);
+  }
+
   CallInst *V = Builder.CreateIntrinsic(
       RetTy, Intrinsic::experimental_vector_partial_reduce_add,
       {PhiVal, BinOpVal}, nullptr, "partial.reduce");
@@ -2570,6 +2572,14 @@ VPExtendedReductionRecipe::computeCost(ElementCount VF,
 InstructionCost
 VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
+  if (getVFScaleFactor() > 1) {
+    return Ctx.TTI.getPartialReductionCost(
+        Instruction::Add, Ctx.Types.inferScalarType(getVecOp0()),
+        Ctx.Types.inferScalarType(getVecOp1()), getResultType(), VF,
+        TTI::getPartialReductionExtendKind(getExtOpcode()),
+        TTI::getPartialReductionExtendKind(getExtOpcode()), Instruction::Mul);
+  }
+
   Type *RedTy = Ctx.Types.inferScalarType(this);
   auto *SrcVecTy =
       cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
@@ -2648,6 +2658,8 @@ void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
   O << " = ";
   getChainOp()->printAsOperand(O, SlotTracker);
   O << " + ";
+  if (getVFScaleFactor() > 1)
+    O << "partial.";
   O << "reduce."
     << Instruction::getOpcodeName(
            RecurrenceDescriptor::getOpcode(getRecurrenceKind()))

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2581,9 +2581,15 @@ expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
       MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc());
   Mul->insertBefore(MulAcc);
 
-  auto *Red = new VPReductionRecipe(
-      MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul,
-      MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc());
+  // Generate VPReductionRecipe.
+  VPReductionRecipe *Red = nullptr;
+  if (unsigned ScaleFactor = MulAcc->getVFScaleFactor(); ScaleFactor > 1)
+    Red = new VPPartialReductionRecipe(Instruction::Add, MulAcc->getChainOp(),
+                                       Mul, MulAcc->getCondOp(), ScaleFactor);
+  else
+    Red = new VPReductionRecipe(MulAcc->getRecurrenceKind(), FastMathFlags(),
+                                MulAcc->getChainOp(), Mul, MulAcc->getCondOp(),
+                                MulAcc->isOrdered(), MulAcc->getDebugLoc());
   Red->insertBefore(MulAcc);
 
   MulAcc->replaceAllUsesWith(Red);
@@ -2911,12 +2917,43 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
   Red->replaceAllUsesWith(AbstractR);
 }
 
+/// This function tries to create an abstract recipe from a partial reduction to
+/// hide its mul and extends from cost estimation.
+static void
+tryToCreateAbstractPartialReductionRecipe(VPPartialReductionRecipe *PRed) {
+  if (PRed->getOpcode() != Instruction::Add)
+    return;
+
+  using namespace llvm::VPlanPatternMatch;
+  auto *BinOp = PRed->getBinOp();
+  if (!match(BinOp,
+             m_Mul(m_ZExtOrSExt(m_VPValue()), m_ZExtOrSExt(m_VPValue()))))
+    return;
+
+  auto *BinOpR = cast<VPWidenRecipe>(BinOp->getDefiningRecipe());
+  VPWidenCastRecipe *Ext0R = dyn_cast<VPWidenCastRecipe>(BinOpR->getOperand(0));
+  VPWidenCastRecipe *Ext1R = dyn_cast<VPWidenCastRecipe>(BinOpR->getOperand(1));
+
+  // TODO: Make work with extends of different signedness
+  if (Ext0R->hasMoreThanOneUniqueUser() || Ext1R->hasMoreThanOneUniqueUser() ||
+      Ext0R->getOpcode() != Ext1R->getOpcode())
+    return;
+
+  auto *AbstractR = new VPMulAccumulateReductionRecipe(
+      PRed, BinOpR, Ext0R, Ext1R, Ext0R->getResultType(),
+      PRed->getVFScaleFactor());
+  AbstractR->insertBefore(PRed);
+  PRed->replaceAllUsesWith(AbstractR);
+}
+
 void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                                VFRange &Range) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
     for (VPRecipeBase &R : *VPBB) {
-      if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
+      if (auto *PRed = dyn_cast<VPPartialReductionRecipe>(&R))
+        tryToCreateAbstractPartialReductionRecipe(PRed);
+      else if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
         tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
     }
   }