Skip to content

Commit a10ce1a

Browse files
committed
WIP histogram autovec
Mostly functioning all-in-one intrinsic autovec
1 parent d82e056 commit a10ce1a

16 files changed

+386
-18
lines changed

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,8 @@ class MemoryDepChecker {
198198
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
199199
const DenseMap<Value *, const SCEV *> &Strides,
200200
const DenseMap<Value *, SmallVector<const Value *, 16>>
201-
&UnderlyingObjects);
201+
&UnderlyingObjects,
202+
const SmallPtrSetImpl<const Value *> &HistogramPtrs);
202203

203204
/// No memory dependence was encountered that would inhibit
204205
/// vectorization.
@@ -330,7 +331,8 @@ class MemoryDepChecker {
330331
isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
331332
unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
332333
const DenseMap<Value *, SmallVector<const Value *, 16>>
333-
&UnderlyingObjects);
334+
&UnderlyingObjects,
335+
const SmallPtrSetImpl<const Value *> &HistogramPtrs);
334336

335337
/// Check whether the data dependence could prevent store-load
336338
/// forwarding.
@@ -394,6 +396,15 @@ struct PointerDiffInfo {
394396
NeedsFreeze(NeedsFreeze) {}
395397
};
396398

399+
struct HistogramInfo {
400+
Instruction *Load;
401+
Instruction *Update;
402+
Instruction *Store;
403+
404+
HistogramInfo(Instruction *Load, Instruction *Update, Instruction *Store)
405+
: Load(Load), Update(Update), Store(Store) {}
406+
};
407+
397408
/// Holds information about the memory runtime legality checks to verify
398409
/// that a group of pointers do not overlap.
399410
class RuntimePointerChecking {
@@ -612,6 +623,10 @@ class LoopAccessInfo {
612623
unsigned getNumStores() const { return NumStores; }
613624
unsigned getNumLoads() const { return NumLoads;}
614625

626+
/// Read-only view of the histogram operations recorded in this analysis
/// result.
const SmallVectorImpl<HistogramInfo> &getHistograms() const {
  const SmallVectorImpl<HistogramInfo> &Recorded = Histograms;
  return Recorded;
}
629+
615630
/// The diagnostics report generated for the analysis. E.g. why we
616631
/// couldn't analyze the loop.
617632
const OptimizationRemarkAnalysis *getReport() const { return Report.get(); }
@@ -724,6 +739,13 @@ class LoopAccessInfo {
724739
/// If an access has a symbolic strides, this maps the pointer value to
725740
/// the stride symbol.
726741
DenseMap<Value *, const SCEV *> SymbolicStrides;
742+
743+
/// Holds the load, update, and store instructions for all histogram-style
744+
/// operations found in the loop.
745+
SmallVector<HistogramInfo, 2> Histograms;
746+
747+
/// Pointers used to update histogram buckets, i.e. the address shared by
/// the load and the store of each histogram operation.
748+
SmallPtrSet<const Value *, 2> HistogramPtrs;
727749
};
728750

729751
/// Return the SCEV corresponding to a pointer with the symbolic stride

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -982,6 +982,9 @@ class TargetTransformInfo {
982982
/// Return hardware support for population count.
983983
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
984984

985+
/// Returns the cost of generating a vector histogram.
986+
InstructionCost getHistogramCost(Type *Ty) const;
987+
985988
/// Return true if the hardware has a fast square-root instruction.
986989
bool haveFastSqrt(Type *Ty) const;
987990

@@ -1930,6 +1933,7 @@ class TargetTransformInfo::Concept {
19301933
unsigned *Fast) = 0;
19311934
virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
19321935
virtual bool haveFastSqrt(Type *Ty) = 0;
1936+
virtual InstructionCost getHistogramCost(Type *Ty) = 0;
19331937
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) = 0;
19341938
virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
19351939
virtual InstructionCost getFPOpCost(Type *Ty) = 0;
@@ -2490,6 +2494,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
24902494
}
24912495
bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
24922496

2497+
/// Forwards the vector-histogram cost query for \p Ty to the wrapped
/// implementation object.
InstructionCost getHistogramCost(Type *Ty) override {
  InstructionCost Cost = Impl.getHistogramCost(Ty);
  return Cost;
}
2500+
24932501
bool isExpensiveToSpeculativelyExecute(const Instruction* I) override {
24942502
return Impl.isExpensiveToSpeculativelyExecute(I);
24952503
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,10 @@ class TargetTransformInfoImplBase {
412412

413413
bool haveFastSqrt(Type *Ty) const { return false; }
414414

415+
/// Default answer for the vector-histogram cost query: an invalid cost,
/// whatever \p Ty is.
InstructionCost getHistogramCost(Type *Ty) const {
  InstructionCost NoSupport = InstructionCost::getInvalid();
  return NoSupport;
}
418+
415419
bool isExpensiveToSpeculativelyExecute(const Instruction *I) { return true; }
416420

417421
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return true; }

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
538538
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
539539
}
540540

541+
/// Vector-histogram cost for \p Ty; this implementation always reports an
/// invalid cost.
InstructionCost getHistogramCost(Type *Ty) {
  InstructionCost NoSupport = InstructionCost::getInvalid();
  return NoSupport;
}
544+
541545
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
542546
return true;
543547
}

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,23 @@ class LoopVectorizationLegality {
387387
unsigned getNumStores() const { return LAI->getNumStores(); }
388388
unsigned getNumLoads() const { return LAI->getNumLoads(); }
389389

390+
/// Returns true if \p I is the load or the update instruction of any
/// histogram recorded by the loop access analysis.
bool isHistogramLoadOrUpdate(Instruction *I) const {
  const SmallVectorImpl<HistogramInfo> &Known = LAI->getHistograms();
  for (auto It = Known.begin(), End = Known.end(); It != End; ++It) {
    const bool IsLoad = It->Load == I;
    const bool IsUpdate = It->Update == I;
    if (IsLoad || IsUpdate)
      return true;
  }

  return false;
}
397+
398+
std::optional<const HistogramInfo *>
399+
getHistogramForStore(StoreInst *SI) const {
400+
for (const HistogramInfo &HGram : LAI->getHistograms())
401+
if (HGram.Store == SI)
402+
return &HGram;
403+
404+
return std::nullopt;
405+
}
406+
390407
PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
391408
return &PSE;
392409
}

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 134 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "llvm/ADT/SmallPtrSet.h"
2222
#include "llvm/ADT/SmallSet.h"
2323
#include "llvm/ADT/SmallVector.h"
24+
#include "llvm/ADT/Statistic.h"
2425
#include "llvm/Analysis/AliasAnalysis.h"
2526
#include "llvm/Analysis/AliasSetTracker.h"
2627
#include "llvm/Analysis/LoopAnalysisManager.h"
@@ -69,6 +70,8 @@ using namespace llvm::PatternMatch;
6970

7071
#define DEBUG_TYPE "loop-accesses"
7172

73+
STATISTIC(HistogramsDetected, "Number of Histograms detected");
74+
7275
static cl::opt<unsigned, true>
7376
VectorizationFactor("force-vector-width", cl::Hidden,
7477
cl::desc("Sets the SIMD width. Zero is autoselect."),
@@ -730,6 +733,23 @@ class AccessAnalysis {
730733
return UnderlyingObjects;
731734
}
732735

736+
/// Find Histogram counts that match high-level code in loops:
737+
/// \code
738+
/// buckets[indices[i]]+=step;
739+
/// \endcode
740+
///
741+
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
742+
/// array the computed histogram. It uses a BinOp to sum all counts, storing
743+
/// them using a loop-variant index Load from the 'indices' input array.
744+
///
745+
/// On successful matches it updates the STATISTIC 'HistogramsDetected' and
746+
/// stores the Load/BinOp/Store instructions of the histogram in
747+
/// \p Histograms, as well as the pointer used to update the histogram in
748+
/// \p HistogramPtrs.
749+
void findHistograms(StoreInst *HSt,
750+
SmallVectorImpl<HistogramInfo> &Histograms,
751+
SmallPtrSetImpl<const Value *> &HistogramPtrs);
752+
733753
private:
734754
typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
735755

@@ -1947,7 +1967,8 @@ getDependenceDistanceStrideAndSize(
19471967
const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
19481968
const DenseMap<Value *, const SCEV *> &Strides,
19491969
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
1950-
PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
1970+
PredicatedScalarEvolution &PSE, const Loop *InnermostLoop,
1971+
const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
19511972
auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
19521973
auto &SE = *PSE.getSE();
19531974
auto [APtr, AIsWrite] = A;
@@ -1965,6 +1986,15 @@ getDependenceDistanceStrideAndSize(
19651986
BPtr->getType()->getPointerAddressSpace())
19661987
return MemoryDepChecker::Dependence::Unknown;
19671988

1989+
// Ignore Histogram count updates as they are handled by the Intrinsic. This
1990+
// happens when the same pointer is first used to read from and then is used
1991+
// to write to.
1992+
if (!AIsWrite && BIsWrite && APtr == BPtr && HistogramPtrs.contains(APtr)) {
1993+
LLVM_DEBUG(dbgs() << "LAA: Histogram: Update is safely ignored. Pointer: "
1994+
<< *APtr);
1995+
return MemoryDepChecker::Dependence::NoDep;
1996+
}
1997+
19681998
int64_t StrideAPtr =
19691999
getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
19702000
int64_t StrideBPtr =
@@ -2018,15 +2048,15 @@ getDependenceDistanceStrideAndSize(
20182048
MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
20192049
const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
20202050
unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
2021-
const DenseMap<Value *, SmallVector<const Value *, 16>>
2022-
&UnderlyingObjects) {
2051+
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
2052+
const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
20232053
assert(AIdx < BIdx && "Must pass arguments in program order");
20242054

20252055
// Get the dependence distance, stride, type size and what access writes for
20262056
// the dependence between A and B.
20272057
auto Res = getDependenceDistanceStrideAndSize(
20282058
A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
2029-
InnermostLoop);
2059+
InnermostLoop, HistogramPtrs);
20302060
if (std::holds_alternative<Dependence::DepType>(Res))
20312061
return std::get<Dependence::DepType>(Res);
20322062

@@ -2240,8 +2270,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
22402270
bool MemoryDepChecker::areDepsSafe(
22412271
DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
22422272
const DenseMap<Value *, const SCEV *> &Strides,
2243-
const DenseMap<Value *, SmallVector<const Value *, 16>>
2244-
&UnderlyingObjects) {
2273+
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
2274+
const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
22452275

22462276
MinDepDistBytes = -1;
22472277
SmallPtrSet<MemAccessInfo, 8> Visited;
@@ -2286,7 +2316,7 @@ bool MemoryDepChecker::areDepsSafe(
22862316

22872317
Dependence::DepType Type =
22882318
isDependent(*A.first, A.second, *B.first, B.second, Strides,
2289-
UnderlyingObjects);
2319+
UnderlyingObjects, HistogramPtrs);
22902320
mergeInStatus(Dependence::isSafeForVectorization(Type));
22912321

22922322
// Gather dependences unless we accumulated MaxDependences
@@ -2622,6 +2652,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
26222652
// check.
26232653
Accesses.buildDependenceSets();
26242654

2655+
for (StoreInst *ST : Stores)
2656+
Accesses.findHistograms(ST, Histograms, HistogramPtrs);
2657+
26252658
// Find pointers with computable bounds. We are going to use this information
26262659
// to place a runtime bound check.
26272660
Value *UncomputablePtr = nullptr;
@@ -2646,7 +2679,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
26462679
LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
26472680
CanVecMem = DepChecker->areDepsSafe(
26482681
DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
2649-
Accesses.getUnderlyingObjects());
2682+
Accesses.getUnderlyingObjects(), HistogramPtrs);
26502683

26512684
if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
26522685
LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
@@ -3084,6 +3117,99 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
30843117
return *I.first->second;
30853118
}
30863119

3120+
/// Try to recognise \p HSt as the store of a histogram-style update of the
/// form
/// \code
///   buckets[indices[i]] += step;
/// \endcode
///
/// The pattern matched is: a store of an Add/Sub whose first operand is a
/// load from the very pointer being stored to, where that pointer is a
/// single-index GEP whose index is (an optionally extended) 32- or 64-bit
/// integer load from an address that SCEV describes as an add-recurrence.
///
/// \param HSt           Candidate store instruction.
/// \param Histograms    Receives the (load, update, store) triple on success.
/// \param HistogramPtrs Receives the bucket pointer on success.
///
/// On success the 'HistogramsDetected' statistic is also incremented. On any
/// mismatch the function returns without modifying its output parameters.
void AccessAnalysis::findHistograms(
    StoreInst *HSt, SmallVectorImpl<HistogramInfo> &Histograms,
    SmallPtrSetImpl<const Value *> &HistogramPtrs) {
  LLVM_DEBUG(dbgs() << "LAA: Attempting to match histogram from " << *HSt
                    << "\n");
  // Store value must come from a Binary Operation.
  Instruction *HPtrInstr = nullptr;
  BinaryOperator *HBinOp = nullptr;
  if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr)))) {
    LLVM_DEBUG(dbgs() << "\tNo BinOp\n");
    return;
  }

  // BinOp must be an Add or a Sub modifying the bucket value by a loop
  // invariant amount.
  // FIXME: We assume the loop invariant term is on the RHS.
  //        Fine for an immediate/constant, but maybe not a generic value?
  Value *HIncVal = nullptr;
  if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
      !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal)))) {
    LLVM_DEBUG(dbgs() << "\tNo matching load\n");
    return;
  }
  // The match above guarantees operand 0 is the load from the bucket pointer.
  Instruction *IndexedLoad = cast<Instruction>(HBinOp->getOperand(0));

  // Actually enforce the loop-invariance requirement stated above. Without
  // this, an update such as 'buckets[idx[i]] += w[i]' would be recorded as a
  // histogram even though its increment changes on every iteration.
  if (!TheLoop->isLoopInvariant(HIncVal)) {
    LLVM_DEBUG(dbgs() << "\tIncrement is not loop invariant\n");
    return;
  }

  // The address to store is calculated through a GEP Instruction.
  // FIXME: Support GEPs with more operands.
  GetElementPtrInst *HPtr = dyn_cast<GetElementPtrInst>(HPtrInstr);
  if (!HPtr || HPtr->getNumOperands() > 2) {
    LLVM_DEBUG(dbgs() << "\tToo many GEP operands\n");
    return;
  }

  // Check that the index is calculated by loading from another array. Ignore
  // any extensions.
  // FIXME: Support indices from other sources than a linear load from memory?
  Value *HIdx = HPtr->getOperand(1);
  Instruction *IdxInst = nullptr;
  // FIXME: Can this fail? Maybe if IdxInst isn't an instruction. Just need to
  //        look through extensions, find another way?
  if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Instruction(IdxInst))))
    return;

  // Currently restricting this to linear addressing when loading indices.
  LoadInst *VLoad = dyn_cast<LoadInst>(IdxInst);
  if (!VLoad) {
    LLVM_DEBUG(dbgs() << "\tBad Index Load\n");
    return;
  }

  if (!isa<SCEVAddRecExpr>(PSE.getSCEV(VLoad->getPointerOperand()))) {
    LLVM_DEBUG(dbgs() << "\tCannot determine index load stride\n");
    return;
  }

  // FIXME: support smaller types of input arrays. Integers can be promoted
  //        for codegen.
  // NOTE: the type inspected here is that of the loaded index, even though
  //       the debug message below speaks of the bucket type.
  Type *VLoadTy = VLoad->getType();
  if (!VLoadTy->isIntegerTy() || (VLoadTy->getScalarSizeInBits() != 32 &&
                                  VLoadTy->getScalarSizeInBits() != 64)) {
    LLVM_DEBUG(dbgs() << "\tUnsupported bucket type: " << *VLoadTy << "\n");
    return;
  }

  // Ensure we'll have the same mask by checking that all parts of the histogram
  // are in the same block.
  // FIXME: Could use dominance checks instead?
  if (IndexedLoad->getParent() != HBinOp->getParent() ||
      IndexedLoad->getParent() != HSt->getParent()) {
    LLVM_DEBUG(dbgs() << "\tDifferent parent blocks\n");
    return;
  }

  // A histogram pointer must only have two uses, the load and the store.
  // This does not depend on any alias set, so check it once up front rather
  // than only when the alias-set loop below happens to execute.
  if (!HPtr->hasNUses(2)) {
    LLVM_DEBUG(dbgs() << "\tAliasing problem\n");
    return;
  }

  // A histogram pointer may only alias to itself: reject it if it shares an
  // alias set with any other pointer.
  for (AliasSet &AS : AST) {
    if (!AS.isMustAlias() && !AS.isMayAlias())
      continue;
    if (is_contained(AS.getPointers(), HPtr) && AS.size() > 1) {
      LLVM_DEBUG(dbgs() << "\tAliasing problem\n");
      return;
    }
  }

  LLVM_DEBUG(dbgs() << "LAA: Found Histogram Operation: " << *HBinOp << "\n");
  HistogramsDetected++;

  // Store the operations that make up the histogram.
  Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
  // Store pointers used to write those counts in the computed histogram.
  HistogramPtrs.insert(HPtr);
}
3212+
30873213
bool LoopAccessInfoManager::invalidate(
30883214
Function &F, const PreservedAnalyses &PA,
30893215
FunctionAnalysisManager::Invalidator &Inv) {

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,10 @@ bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
653653
return TTIImpl->haveFastSqrt(Ty);
654654
}
655655

656+
/// Asks the target implementation for the cost of generating a vector
/// histogram for \p Ty and returns its answer unchanged.
InstructionCost TargetTransformInfo::getHistogramCost(Type *Ty) const {
  InstructionCost Cost = TTIImpl->getHistogramCost(Ty);
  return Cost;
}
659+
656660
bool TargetTransformInfo::isExpensiveToSpeculativelyExecute(
657661
const Instruction *I) const {
658662
return TTIImpl->isExpensiveToSpeculativelyExecute(I);

0 commit comments

Comments
 (0)