-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[LV] Vectorize histogram operations #99851
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV] Vectorize histogram operations #99851
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Graham Hunter (huntergr-arm) ChangesThis patch implements autovectorization support for the 'all-in-one' histogram intrinsic, which seems to have more support than the 'standalone' intrinsic. See https://discourse.llvm.org/t/rfc-vectorization-support-for-histogram-count-operations/74788/ for an overview of the work and my notes on the tradeoffs between the two approaches. This is a new PR after the previous one (#91458) was reverted at maintainer request. I have removed the changes to LoopAccessAnalysis in favor continuing analysis in LoopVectorizationLegality. Patch is 57.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/99851.diff 11 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 2ff17bd2f7a71..372ac6ee3fb00 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -224,6 +224,18 @@ class LoopVectorizationRequirements {
Instruction *ExactFPMathInst = nullptr;
};
+/// This holds details about a histogram operation -- a load -> update -> store
+/// sequence where each lane in a vector might be updating the same element as
+/// another lane.
+struct HistogramInfo {
+ LoadInst *Load;
+ Instruction *Update;
+ StoreInst *Store;
+
+ HistogramInfo(LoadInst *Load, Instruction *Update, StoreInst *Store)
+ : Load(Load), Update(Update), Store(Store) {}
+};
+
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
@@ -390,6 +402,22 @@ class LoopVectorizationLegality {
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }
+ /// Returns a HistogramInfo* for the given instruction if it was determined
+ /// to be part of a load -> update -> store sequence where multiple lanes
+ /// may be working on the same memory address.
+ std::optional<const HistogramInfo *> getHistogramInfo(Instruction *I) const {
+ for (const HistogramInfo &HGram : Histograms)
+ if (HGram.Load == I || HGram.Update == I || HGram.Store == I)
+ return &HGram;
+
+ return std::nullopt;
+ }
+
+ /// Returns a list of all known histogram operations in the loop.
+ const SmallVectorImpl<HistogramInfo> &getHistograms() const {
+ return Histograms;
+ }
+
PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
return &PSE;
}
@@ -438,6 +466,11 @@ class LoopVectorizationLegality {
/// Returns true if the loop is vectorizable
bool canVectorizeMemory();
+ /// If LAA cannot determine whether all dependences are safe, we may be able
+ /// to further analyse some unknown dependences and if they match a certain
+ /// pattern (like a histogram) then we may still be able to vectorize.
+ bool canVectorizeUnknownDependences();
+
/// Return true if we can vectorize this loop using the IF-conversion
/// transformation.
bool canVectorizeWithIfConvert();
@@ -542,6 +575,11 @@ class LoopVectorizationLegality {
/// conditional assumes.
SmallPtrSet<const Instruction *, 8> MaskedOp;
+ /// Contains all identified histogram operations, which are sequences of
+ /// load -> update -> store instructions where multiple lanes in a vector
+ /// may work on the same memory location.
+ SmallVector<HistogramInfo, 1> Histograms;
+
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index cafec165f6d6f..e128c4124e1f8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -78,6 +78,10 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
"Scalable vectorization is available and favored when the "
"cost is inconclusive.")));
+static cl::opt<bool> EnableHistogramVectorization(
+ "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
+ cl::desc("Enables autovectorization of some loops containing histograms"));
+
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -1054,6 +1058,110 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return true;
}
+/// Find Histogram counts that match high-level code in loops:
+/// \code
+/// buckets[indices[i]]+=step;
+/// \endcode
+///
+/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
+/// array the computed histogram. It uses a BinOp to sum all counts, storing
+/// them using a loop-variant index Load from the 'indices' input array.
+///
+/// On successful matches it updates the STATISTIC 'HistogramsDetected',
+/// regardless of hardware support. When there is support, it additionally
+/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers
+/// used to update histogram in \p HistogramPtrs.
+
+static bool findHistograms(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
+ const PredicatedScalarEvolution &PSE,
+ SmallVectorImpl<HistogramInfo> &Histograms) {
+
+ // Store value must come from a Binary Operation.
+ Instruction *HPtrInstr = nullptr;
+ BinaryOperator *HBinOp = nullptr;
+ if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr))))
+ return false;
+
+ // BinOp must be an Add or a Sub modifying the bucket value by a
+ // loop invariant amount.
+ // FIXME: We assume the loop invariant term is on the RHS.
+ // Fine for an immediate/constant, but maybe not a generic value?
+ Value *HIncVal = nullptr;
+ if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+ !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))))
+ return false;
+
+ // Make sure the increment value is loop invariant.
+ if (!TheLoop->isLoopInvariant(HIncVal))
+ return false;
+
+ // The address to store is calculated through a GEP Instruction.
+ // FIXME: Support GEPs with more operands.
+ GetElementPtrInst *HPtr = dyn_cast<GetElementPtrInst>(HPtrInstr);
+ if (!HPtr || HPtr->getNumOperands() > 2)
+ return false;
+
+ // Check that the index is calculated by loading from another array. Ignore
+ // any extensions.
+ // FIXME: Support indices from other sources that a linear load from memory?
+ Value *HIdx = HPtr->getOperand(1);
+ Instruction *IdxInst = nullptr;
+ if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Instruction(IdxInst))))
+ return false;
+
+ // Currently restricting this to linear addressing when loading indices.
+ LoadInst *VLoad = dyn_cast<LoadInst>(IdxInst);
+ Value *VPtrVal;
+ if (!VLoad || !match(VLoad, m_Load(m_Value(VPtrVal))))
+ return false;
+
+ if (!isa<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal)))
+ return false;
+
+ // Ensure we'll have the same mask by checking that all parts of the histogram
+ // (gather load, update, scatter store) are in the same block.
+ LoadInst *IndexedLoad = cast<LoadInst>(HBinOp->getOperand(0));
+ BasicBlock *LdBB = IndexedLoad->getParent();
+ if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");
+
+ // Store the operations that make up the histogram.
+ Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeUnknownDependences() {
+ // For now, we only support an unknown dependency that calculates a histogram
+ if (!EnableHistogramVectorization)
+ return false;
+
+ // FIXME: Support more than one unknown dependence, and check to see if some
+ // are handled by runtime checks before looking for histograms.
+ LAI = &LAIs.getInfo(*TheLoop);
+ const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+ const auto *Deps = DepChecker.getDependences();
+ if (!Deps || Deps->size() > 1)
+ return false;
+
+ const MemoryDepChecker::Dependence &Dep = (*Deps).front();
+
+ // We're only interested in unknown dependences.
+ if (Dep.Type != MemoryDepChecker::Dependence::Unknown)
+ return false;
+
+ // For now only normal loads and stores are supported.
+ LoadInst *LI = dyn_cast<LoadInst>(Dep.getSource(DepChecker));
+ StoreInst *SI = dyn_cast<StoreInst>(Dep.getDestination(DepChecker));
+
+ if (!LI || !SI)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
+ return findHistograms(LI, SI, TheLoop, LAI->getPSE(), Histograms);
+}
+
bool LoopVectorizationLegality::canVectorizeMemory() {
LAI = &LAIs.getInfo(*TheLoop);
const OptimizationRemarkAnalysis *LAR = LAI->getReport();
@@ -1065,7 +1173,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}
if (!LAI->canVectorizeMemory())
- return false;
+ return canVectorizeUnknownDependences();
if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
reportVectorizationFailure("We don't allow storing to uniform addresses",
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6d28b8fabe42e..823c95d8bb4f9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4622,6 +4622,10 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
return false;
+ // Loops containing histograms are not currently supported.
+ if (!Legal->getHistograms().empty())
+ return false;
+
return true;
}
@@ -6465,8 +6469,33 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// We've proven all lanes safe to speculate, fall through.
[[fallthrough]];
case Instruction::Add:
+ case Instruction::Sub: {
+ auto Info = Legal->getHistogramInfo(I);
+ if (Info && VF.isVector()) {
+ const HistogramInfo *HGram = Info.value();
+ // Assume that a non-constant update value (or a constant != 1) requires
+ // a multiply, and add that into the cost.
+ InstructionCost MulCost = TTI::TCC_Free;
+ ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!RHS || RHS->getZExtValue() != 1)
+ MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
+
+ // Find the cost of the histogram operation itself.
+ Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
+ Type *ScalarTy = I->getType();
+ Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
+ IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
+ Type::getVoidTy(I->getContext()),
+ {PtrTy, ScalarTy, MaskTy});
+
+ // Add the costs together with the add/sub operation.
+ return TTI.getIntrinsicInstrCost(
+ ICA, TargetTransformInfo::TCK_RecipThroughput) +
+ MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+ }
+ [[fallthrough]];
+ }
case Instruction::FAdd:
- case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
@@ -8173,6 +8202,36 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
};
}
+VPHistogramRecipe *
+VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
+ ArrayRef<VPValue *> Operands) {
+ // FIXME: Support other operations.
+ unsigned Opcode = HI->Update->getOpcode();
+ assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
+ "Histogram update operation must be an Add or Sub");
+
+ SmallVector<VPValue *, 3> HGramOps;
+ // Bucket address.
+ HGramOps.push_back(Operands[1]);
+ // Increment value.
+ HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1), Plan));
+
+ // In case of predicated execution (due to tail-folding, or conditional
+ // execution, or both), pass the relevant mask. When there is no such mask,
+ // generate an all-true mask.
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(HI->Store))
+ Mask = getBlockInMask(HI->Store->getParent());
+ else
+ Mask = Plan.getOrAddLiveIn(
+ ConstantInt::getTrue(IntegerType::getInt1Ty(HI->Load->getContext())));
+ HGramOps.push_back(Mask);
+
+ return new VPHistogramRecipe(HI, Opcode,
+ make_range(HGramOps.begin(), HGramOps.end()),
+ HI->Store->getDebugLoc());
+}
+
void VPRecipeBuilder::fixHeaderPhis() {
BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
for (VPHeaderPHIRecipe *R : PhisToFix) {
@@ -8296,6 +8355,10 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (auto *CI = dyn_cast<CallInst>(Instr))
return tryToWidenCall(CI, Operands, Range);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
+ if (auto HistInfo = Legal->getHistogramInfo(SI))
+ return tryToWidenHistogram(*HistInfo, Operands);
+
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
@@ -8563,6 +8626,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Operands = {OpRange.begin(), OpRange.end()};
}
+ // If this is a load instruction or a binop associated with a histogram,
+ // leave it until the store instruction to emit a combined intrinsic.
+ // Note that if the initial VF is scalar, we need to generate the normal
+ // clone recipe for these instructions. A histogram recipe will only be
+ // generated when minVF > 1.
+ if (Legal->getHistogramInfo(Instr) && !isa<StoreInst>(Instr) &&
+ !Range.Start.isScalar())
+ continue;
+
// Invariant stores inside loop will be deleted and a single store
// with the final reduction value will be added to the exit block
StoreInst *SI;
@@ -9890,6 +9962,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InterleaveLoop = false;
}
+ // If there is a histogram in the loop, do not just interleave without
+ // vectorizing. The order of operations will be incorrect without the
+ // histogram intrinsics, which are only used for recipes with VF > 1.
+ if (!VectorizeLoop && InterleaveLoop && !LVL.getHistograms().empty()) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
+ << "to histogram operations.\n");
+ IntDiagMsg = std::make_pair(
+ "HistogramPreventsScalarInterleaving",
+ "Unable to interleave without vectorization due to constraints on "
+ "the order of histogram operations");
+ InterleaveLoop = false;
+ }
+
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b4c7ab02f928f..b3b25047820f1 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -20,6 +20,7 @@ namespace llvm {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class TargetLibraryInfo;
+struct HistogramInfo;
/// Helper class to create VPRecipies from IR instructions.
class VPRecipeBuilder {
@@ -102,6 +103,13 @@ class VPRecipeBuilder {
VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
VPBasicBlock *VPBB);
+ /// Makes Histogram count operations safe for vectorization, by emitting a
+ /// llvm.experimental.vector.histogram.add intrinsic in place of the
+ /// Load + Add|Sub + Store operations that perform the histogram in the
+ /// original scalar loop.
+ VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
+ ArrayRef<VPValue *> Operands);
+
public:
VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
LoopVectorizationLegality *Legal,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 805d9d91fc186..7d41aba3f827d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -69,6 +69,7 @@ class LoopVectorizationCostModel;
class LoopVersioning;
struct VPCostContext;
+struct HistogramInfo;
namespace Intrinsic {
typedef unsigned ID;
@@ -937,6 +938,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
case VPRecipeBase::VPWidenStoreSC:
+ case VPRecipeBase::VPHistogramSC:
// TODO: Widened stores don't define a value, but widened loads do. Split
// the recipes to be able to make widened loads VPSingleDefRecipes.
return false;
@@ -1573,6 +1575,39 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
#endif
};
+class VPHistogramRecipe : public VPRecipeBase {
+ const HistogramInfo *Info;
+ unsigned Opcode;
+
+public:
+ template <typename IterT>
+ VPHistogramRecipe(const HistogramInfo *HI, unsigned Opcode,
+ iterator_range<IterT> Operands, DebugLoc DL = {})
+ : VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Info(HI),
+ Opcode(Opcode) {}
+
+ ~VPHistogramRecipe() override = default;
+
+ VPHistogramRecipe *clone() override {
+ llvm_unreachable("cloning not supported");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPHistogramSC);
+
+ // Produce a histogram operation with widened ingredients
+ void execute(VPTransformState &State) override;
+
+ unsigned getOpcode() const { return Opcode; }
+
+ const HistogramInfo *getHistogramInfo() const { return Info; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for widening select instructions.
struct VPWidenSelectRecipe : public VPSingleDefRecipe {
template <typename IterT>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1b787d0490672..7701300d9a273 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
@@ -30,6 +31,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <cassert>
using namespace llvm;
@@ -279,6 +281,10 @@ static Instruction *getInstructionForCost(const VPRecipeBase *R) {
return IG->getInsertPos();
if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
return &WidenMem->getIngredient();
+ // FIXME: Override the cost method properly to take gather/scatter cost
+ // into account, instead of just the intrinsic via the legacy model.
+ if (auto *HG = dyn_cast<VPHistogramRecipe>(R))
+ return HG->getHistogramInfo()->Update;
return nullptr;
}
@@ -958,6 +964,44 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
O << ")";
}
}
+#endif
+
+void VPHistogramRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFrom(getDebugLoc());
+ IRBuilderBase &Builder = State.Builder;
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Address = State.get(getOperand(0), Part);
+ Value *IncVec = State.get(getOperand(1), Part);
+ Value *Mask = State.get(getOperand(2), Part);
+
+ // Not sure how to make IncAmt stay scalar yet. For now just extract the
+ // first element and tidy up later.
+ // FIXME: Do we actually want this to be scalar? We just splat it in the
+ // backend anyway...
+ Value *IncAmt = Builder.CreateExtractElement(IncVec, Builder.getInt64(0));
+
+ // If this is a subtract, we want to invert the increment amount. We may
+ // add a separate intrinsic in future, but for now we'll try this.
+ if (Opcode == Instruction::Sub)
+ IncAmt = Builder.CreateNeg(IncAmt);
+
+ State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
+ {Address->getType(), IncAmt->getType()},
+ {Address, IncAmt, Mask});
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-HISTOGRAM buckets: ";
+ getOperand(0)->printAsOperand(O, SlotTracker);
+ O << ", inc: ";
+ getOperand(1)-...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was only able to take a quick look at the changes; they do look good.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good to me as well, thank you.
// Loops containing histograms are not currently supported. | ||
if (!Legal->getHistograms().empty()) | ||
return false; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Loops containing histograms are not currently supported. | |
if (!Legal->getHistograms().empty()) | |
return false; | |
return Legal->getHistograms().empty() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
if (Legal->isMaskRequired(HI->Store)) | ||
Mask = getBlockInMask(HI->Store->getParent()); | ||
else | ||
Mask = Plan.getOrAddLiveIn( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For loads/stores the mask is optional, instead of providing an all true mask can we instead not add a mask if there's no mask needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done. The intrinsic requires a mask, but the recipe doesn't. The all-true mask is now synthesized in the execute method.
I also improved recipe printing to use 'dec' instead of 'inc' if the opcode was a subtract.
// Note that if the initial VF is scalar, we need to generate the normal | ||
// clone recipe for these instructions. A histogram recipe will only be | ||
// generated when minVF > 1. | ||
if (Legal->getHistogramInfo(Instr) && !isa<StoreInst>(Instr) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be sufficient to leave to load around, to be cleaned up later by DCE?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
@@ -1573,6 +1575,39 @@ class VPWidenCallRecipe : public VPSingleDefRecipe { | |||
#endif | |||
}; | |||
|
|||
class VPHistogramRecipe : public VPRecipeBase { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please document the new recipe
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
@@ -1573,6 +1575,39 @@ class VPWidenCallRecipe : public VPSingleDefRecipe { | |||
#endif | |||
}; | |||
|
|||
class VPHistogramRecipe : public VPRecipeBase { | |||
const HistogramInfo *Info; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can store by reference?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
/// regardless of hardware support. When there is support, it additionally | ||
/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers | ||
/// used to update histogram in \p HistogramPtrs. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: remove newline?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers | ||
/// used to update histogram in \p HistogramPtrs. | ||
|
||
static bool findHistograms(LoadInst *LI, StoreInst *HSt, Loop *TheLoop, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: findHistograms as this only looks for a single one?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
@@ -0,0 +1,44 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 | |||
; RUN: opt < %s -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -S | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
would this vectorize with -force-vector-width=2
or something like that?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added the flag, but it won't vectorize at present due to the cost being Invalid. Forcing the instruction cost to 1 no longer works, so this will have to wait for an improved cost model.
|
||
// If this is a subtract, we want to invert the increment amount. We may | ||
// add a separate intrinsic in future, but for now we'll try this. | ||
if (Opcode == Instruction::Sub) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
assert that opcode is :Add in the else?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
LoadInst *LI = dyn_cast<LoadInst>(Dep.getSource(DepChecker)); | ||
StoreInst *SI = dyn_cast<StoreInst>(Dep.getDestination(DepChecker)); | ||
|
||
if (!LI || !SI) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just checking if we have a test where the source is a store and dest is a load, possibly for loops stepping backwards?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't.
I tried creating one but was never able to get LAA to recognize the store as a source -- it always gave up before that point, possibly due to the indirection. Maybe I'm missing something.
dd1f39a
to
0eafbf8
Compare
Rebased, addressed comments, added some more tests. |
@fhahn ping -- do the latest changes sufficiently address your concerns? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the update! some additional comments inline
} | ||
|
||
/// Returns a list of all known histogram operations in the loop. | ||
const SmallVectorImpl<HistogramInfo> &getHistograms() const { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
looking at the users of this function, might be cleared to have hasHistograms()
instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
@@ -438,6 +466,11 @@ class LoopVectorizationLegality { | |||
/// Returns true if the loop is vectorizable | |||
bool canVectorizeMemory(); | |||
|
|||
/// If LAA cannot determine whether all dependences are safe, we may be able | |||
/// to further analyse some unknown dependences and if they match a certain |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a recent LAA fix, and the dependences should now be classified as IndirectUnsafe
I think. Would be good to update the comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
} | ||
|
||
bool LoopVectorizationLegality::canVectorizeUncheckedDependences() { | ||
// For now, we only support an unknown dependency that calculates a histogram |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unknown-> IndirectUnsafe?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
if (!EnableHistogramVectorization) | ||
return false; | ||
|
||
// FIXME: Support more than one unchecked dependence, and check to see if some |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think with the change to use IndirectUnsafe, unknown dependences should be fine, as we can generate runtime checks for those
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I hadn't appreciated that part of the LAA change. done, and I've added a test with a second dependency.
|
||
const MemoryDepChecker::Dependence &Dep = (*Deps).front(); | ||
|
||
// We're only interested in Unknown or IndirectUnsafe dependences. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it sufficient to look for IndirectUnsafe dependences here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
if (getNumOperands() == 3) | ||
return getOperand(2); | ||
return nullptr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (getNumOperands() == 3) | |
return getOperand(2); | |
return nullptr; | |
return getNumOperands() == 3 ? getOperand(2) : nullptr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
Mask = Builder.CreateVectorSplat( | ||
VTy->getElementCount(), ConstantInt::getTrue(Builder.getInt1Ty())); | ||
|
||
// Not sure how to make IncAmt stay scalar yet. For now just extract the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sufficient to implement VPHistogramRecipe::onlyFirstLaneUsed
and return true for the inc operand?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That didn't work, but supplying true
to the IsScalar parameter of State.get did.
if (Opcode == Instruction::Sub) | ||
IncAmt = Builder.CreateNeg(IncAmt); | ||
else | ||
assert(Opcode == Instruction::Add); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
assert(Opcode == Instruction::Add); | |
assert(Opcode == Instruction::Add && "only add or sub supported for now"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
@@ -30,6 +31,7 @@ | |||
#include "llvm/Transforms/Utils/BasicBlockUtils.h" | |||
#include "llvm/Transforms/Utils/LoopUtils.h" | |||
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" | |||
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be better to define HistogramInfo in VPlan.h instead of pulling LoopVectorizationLegality.h
in here, as VPlanRecipes.cpp should not depend/rely on LVL
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sadly, that's not quite possible at the moment. LoopVectorizationLegality.h needs the definition of HistogramInfo since it creates a SmallVector of them, and accesses the fields. I can't include VPlan.h in the other header, since VPlan.h is under the lib/ subtree instead of the include/ subtree. I could perhaps try to fudge it by always including VPlan.h first in any .cpp file, but that feels fragile (and clang-format doesn't like it anyway).
I suspect a shared header under include/ would be best. Or have I missed something?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like VPHistogramRecipe only needs Update
of HistogramInfo
and that only for retrieving the legacy cost. Ideally we would include VPHistogramRecipe::computeCost
in this patch, avoiding the need for accessing Update
.
If that isn't easily possibly, can we just store Update
(with a comment what it is used for)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still pending?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was done with the last round of updates. Old diff?
148407b
to
199862b
Compare
Rebased and added REQUIRES lines to tests using debug capabilities. Any additional comments? |
@@ -274,6 +276,10 @@ static Instruction *getInstructionForCost(const VPRecipeBase *R) { | |||
return IG->getInsertPos(); | |||
if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) | |||
return &WidenMem->getIngredient(); | |||
// FIXME: Override the cost method properly to take gather/scatter cost |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Histogram cost computation looks self contained, can we instead directly implement VPHistogramRecipe::computeCost
instead falling back?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, can the FIXME & histogram case below be removed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
They were removed. Are you looking at a cached version of the diff?
|
||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" | ||
|
||
define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
might be worth adding a comment here saying what we expect here, i.e. no vectorization as (generic) target doesn't support histogram intrinsics?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
; CHECK-NEXT: WIDEN ir<%0> = load vp<%4> | ||
; CHECK-NEXT: WIDEN-CAST ir<%idxprom1> = zext ir<%0> to i64 | ||
; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx2> = getelementptr inbounds ir<%buckets>, ir<%idxprom1> | ||
; CHECK-NEXT: WIDEN-HISTOGRAM buckets: ir<%arrayidx2>, inc: ir<1> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would probably better to have a separate test file to check printing of a plan with a histogram recipe.
It should be sufficient to check printing for a single loop; Ideally also avoiding hard-coded vp<%xx>
as this makes the tests fragile.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done, sve2-histcnt-vplan.ll
@@ -30,6 +31,7 @@ | |||
#include "llvm/Transforms/Utils/BasicBlockUtils.h" | |||
#include "llvm/Transforms/Utils/LoopUtils.h" | |||
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" | |||
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like VPHistogramRecipe only needs Update
of HistogramInfo
and that only for retrieving the legacy cost. Ideally we would include VPHistogramRecipe::computeCost
in this patch, avoiding the need for accessing Update
.
If that isn't easily possibly, can we just store Update
(with a comment what it is used for)?
%arrayidx = getelementptr inbounds i32, ptr %indices, i64 %iv | ||
%0 = load i32, ptr %arrayidx, align 4 | ||
%idxprom1 = zext i32 %0 to i64 | ||
%arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit to possibly help readability
%arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1 | |
%gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
%0 = load i32, ptr %arrayidx, align 4 | ||
%idxprom1 = zext i32 %0 to i64 | ||
%arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1 | ||
%1 = load i32, ptr %arrayidx2, align 4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit to possibly help readability
%1 = load i32, ptr %arrayidx2, align 4 | |
%l.bucket = load i32, ptr %arrayidx2, align 4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
LAI = &LAIs.getInfo(*TheLoop); | ||
const MemoryDepChecker::Dependence *IUDep = nullptr; | ||
const MemoryDepChecker &DepChecker = LAI->getDepChecker(); | ||
const auto *Deps = DepChecker.getDependences(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Deps
may be nullptr here; LAA only records dependences up to MaxDependences
(would be good to have a test using something like -max-dependences=2
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, sve2-histcnt-too-many-deps.ll
// any extensions. | ||
// FIXME: Support indices from other sources that a linear load from memory? | ||
Value *VPtrVal; | ||
if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Load(m_Value(VPtrVal))))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we have a tests where the index isn't extended? Coudlnt' find any in sve2-histcnt.ll but might have missed them
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added simple_histogram_64b
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks!
|
||
// Restrict address calculation to constant indices except for the last term. | ||
Value *HIdx = nullptr; | ||
for (Value *Index : GEP->indices()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are all cases exercised in tests? Looks like the tests are all using %arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
? Should at least have some with additional constant indices?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had a test (histogram_array_3op_gep
) which exercised this with global arrays, requiring one extra constant index argument.
I've added a second (histogram_array_4op_gep_nonzero_const_idx
) with a struct as well, with one of the constant terms being nonzero.
Are there other cases you would like to see?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for adding the extra tests.
|
||
// Check that the index is calculated by loading from another array. Ignore | ||
// any extensions. | ||
// FIXME: Support indices from other sources that a linear load from memory? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// FIXME: Support indices from other sources that a linear load from memory? | |
// FIXME: Support indices from other sources than a linear load from memory? |
I assume the check isa<SCEVAddRecExpr>
below guards for this FIXME? Would be good to document this current restriction. Does the step of the SCEV matter? It could also be an add-rec for an other loop, which would pass this check but would be a uniform load?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done, added sve2-histcnt-outerloop-scevaddrec.ll
for a test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for adding the new test.
I might have missed it elsewhere, does the step of the AddRec below matter? Does the loop associated with the add rec matter? e.g. it may be an AddRec in an outer loop, then it won't be incremented on each iteration of the inner loop.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added a check+bailout for a loop-invariant stride below this point. The step doesn't matter beyond that I think. In theory we don't even need it to be an AddRec, but I'm being cautious for now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like you've addressed Florian's remaining comments and I'm happy with the state it's in.
@@ -69,6 +69,7 @@ class LoopVectorizationCostModel; | |||
class LoopVersioning; | |||
|
|||
struct VPCostContext; | |||
struct HistogramInfo; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
might have missed it, but not used in latest version in the file?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
return false; | ||
|
||
if (!isa<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal)) || | ||
TheLoop->isLoopInvariant(VPtrVal)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better/clearer to check if the AddRec's loop is TheLoop? I think it may be possible for VPtrVal (the GEP) the be placed inside TheLoop, but still be an AddRec of the outer loop?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
/// of intrinsics. The only update operations currently supported are | ||
/// 'add' and 'sub' where the other term is loop-invariant. | ||
class VPHistogramRecipe : public VPRecipeBase { | ||
unsigned Opcode; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: document that this is the opcode of the increment?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
// vec-of-pointers. | ||
assert(VF.isVector() && "Invalid VF for histogram cost"); | ||
Value *Address = getOperand(0)->getUnderlyingValue(); | ||
Value *IncAmt = getOperand(1)->getUnderlyingValue(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The operand may be replaced by a recipe without underlying value. To get the type, better to use Ctx.Types.inferScalarType
. There's also a ToVector helper to create a vector type
Same for using Address to get the type
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
// Assume that a non-constant update value (or a constant != 1) requires | ||
// a multiply, and add that into the cost. | ||
InstructionCost MulCost = TTI::TCC_Free; | ||
ConstantInt *RHS = dyn_cast<ConstantInt>(IncAmt); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There may be no underlying value for IncAmt
, better to check if it is a live in (isLiveIn) and use getLiveInIRValue if it is?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
1c63f05
to
031bb9e
Compare
Rebased, and fixed up a couple of test that had slightly different output. |
case Instruction::Sub: { | ||
auto Info = Legal->getHistogramInfo(I); | ||
if (Info && VF.isVector()) { | ||
const HistogramInfo *HGram = Info.value(); | ||
// Assume that a non-constant update value (or a constant != 1) requires | ||
// a multiply, and add that into the cost. | ||
InstructionCost MulCost = TTI::TCC_Free; | ||
ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1)); | ||
if (!RHS || RHS->getZExtValue() != 1) | ||
MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); | ||
|
||
// Find the cost of the histogram operation itself. | ||
Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF); | ||
Type *ScalarTy = I->getType(); | ||
Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF); | ||
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, | ||
Type::getVoidTy(I->getContext()), | ||
{PtrTy, ScalarTy, MaskTy}); | ||
|
||
// Add the costs together with the add/sub operation. | ||
return TTI.getIntrinsicInstrCost( | ||
ICA, TargetTransformInfo::TCK_RecipThroughput) + | ||
MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); | ||
} | ||
[[fallthrough]]; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this still needed in the latest version with `::computeCost?
case Instruction::Sub: { | |
auto Info = Legal->getHistogramInfo(I); | |
if (Info && VF.isVector()) { | |
const HistogramInfo *HGram = Info.value(); | |
// Assume that a non-constant update value (or a constant != 1) requires | |
// a multiply, and add that into the cost. | |
InstructionCost MulCost = TTI::TCC_Free; | |
ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1)); | |
if (!RHS || RHS->getZExtValue() != 1) | |
MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); | |
// Find the cost of the histogram operation itself. | |
Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF); | |
Type *ScalarTy = I->getType(); | |
Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF); | |
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, | |
Type::getVoidTy(I->getContext()), | |
{PtrTy, ScalarTy, MaskTy}); | |
// Add the costs together with the add/sub operation. | |
return TTI.getIntrinsicInstrCost( | |
ICA, TargetTransformInfo::TCK_RecipThroughput) + | |
MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); | |
} | |
[[fallthrough]]; | |
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that's still needed for now -- the two cost models disagree and assert if I remove that code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for checking, I though the assert wouldn't trigger due to planContainsAdditionalSimplifications
.
I'll try committing this on Friday, unless there are any other comments? |
State.setDebugLocFrom(getDebugLoc()); | ||
IRBuilderBase &Builder = State.Builder; | ||
|
||
for (unsigned Part = 0; Part < State.UF; ++Part) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like the PR needs a rebase I think
// a multiply, and add that into the cost. | ||
Value *RHS = IncAmt->getUnderlyingValue(); | ||
// The underlying value may be null, check for a live-in if so. | ||
if (!RHS && IncAmt->isLiveIn()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is RHS only used to check if it is constant? It can only be constant if it is a live-in, so retrieving getUnderlyingValue first can be skipped?
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 | ||
; RUN: opt < %s -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -force-vector-width=2 -S | FileCheck %s | ||
|
||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this needed?
case Instruction::Sub: { | ||
auto Info = Legal->getHistogramInfo(I); | ||
if (Info && VF.isVector()) { | ||
const HistogramInfo *HGram = Info.value(); | ||
// Assume that a non-constant update value (or a constant != 1) requires | ||
// a multiply, and add that into the cost. | ||
InstructionCost MulCost = TTI::TCC_Free; | ||
ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1)); | ||
if (!RHS || RHS->getZExtValue() != 1) | ||
MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); | ||
|
||
// Find the cost of the histogram operation itself. | ||
Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF); | ||
Type *ScalarTy = I->getType(); | ||
Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF); | ||
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, | ||
Type::getVoidTy(I->getContext()), | ||
{PtrTy, ScalarTy, MaskTy}); | ||
|
||
// Add the costs together with the add/sub operation. | ||
return TTI.getIntrinsicInstrCost( | ||
ICA, TargetTransformInfo::TCK_RecipThroughput) + | ||
MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); | ||
} | ||
[[fallthrough]]; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for checking, I though the assert wouldn't trigger due to planContainsAdditionalSimplifications
.
* Mask is optional for vplan recipe * Left load+update to DCE * Documentation for new recipe * Stored histogram info by reference * Allow GEPs with more than one index value, provided that all but the last are constant. * Cleanup to remove some unnecessary local variables. * Minor nits addressed.
* Improve test ir names * Explain current limitation on indices * Check index address is not loop-invariant (with test) * Check for null pointer on dependences (with test) * Remove HistogramInfo from recipe, remove legality header from VPlanRecipes.cpp * Implement overridden computeCost for VPHistogramRecipe * Separated VPlan checks from main test file, used regex matching * Added a 4-op GEP test featuring a struct * Added 64b histogram test without index extension * Added comment to target-generic test file explaining it doesn't currently vectorize
* Check AddRec Loop vs. current * Remove stray struct token declaration * Add comment on Opcode field of VPHistogramRecipe * Use Ctx.Types.inferScalarType when computing cost * Check for a LiveIn value if there isn't an underlying value for costing * Tolerate nulls for both underlying and LiveIn, just in case.
… histogram recipe
031bb9e
to
65d7ab9
Compare
Rebased, addressed comments, added support for epilogue vectorization now that the clone method has been implemented. Simplified the tests via flags where appropriate, updated the i8 test since the backend is now able to legalize it. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
return false; | ||
|
||
// Find a single IndirectUnsafe dependency. | ||
LAI = &LAIs.getInfo(*TheLoop); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this needed or can it be dropped? It should be initialized by canVectorizeMemory which is called before this one?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
Mask = Builder.CreateVectorSplat(VTy->getElementCount(), | ||
ConstantInt::getTrue(Builder.getInt1Ty())); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mask = Builder.CreateVectorSplat(VTy->getElementCount(), | |
ConstantInt::getTrue(Builder.getInt1Ty())); | |
Mask = Builder.CreateVectorSplat(VTy->getElementCount(), | |
Builder.getInt1(1)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
Thanks for the thorough review :) |
This patch implements autovectorization support for the 'all-in-one' histogram intrinsic, which seems to have more support than the 'standalone' intrinsic. See https://discourse.llvm.org/t/rfc-vectorization-support-for-histogram-count-operations/74788/ for an overview of the work and my notes on the tradeoffs between the two approaches.
This patch implements autovectorization support for the 'all-in-one' histogram intrinsic, which seems to have more support than the 'standalone' intrinsic. See https://discourse.llvm.org/t/rfc-vectorization-support-for-histogram-count-operations/74788/ for an overview of the work and my notes on the tradeoffs between the two approaches.
This is a new PR after the previous one (#91458) was reverted at maintainer request.
I have removed the changes to LoopAccessAnalysis in favor continuing analysis in LoopVectorizationLegality.