Skip to content

Commit a965c25

Browse files
committed
[LV] Vectorize simple histograms
This patch introduces the ability to autovectorize loops containing a histogram operation; that is, a sequence that (1) loads from non-contiguous, possibly overlapping addresses, (2) updates the loaded values with a loop-invariant value, and (3) stores the results back to the same addresses. An example: `void simple_histogram(int *restrict buckets, unsigned *indices, int N) { for (int i = 0; i < N; ++i) buckets[indices[i]]++; }`. For this initial variant, we're fairly conservative: we don't allow additional uses of the loaded values, and we only support add/sub of integers. This uses the recently committed histogram intrinsic.
1 parent eecc936 commit a965c25

File tree

17 files changed

+465
-40
lines changed

17 files changed

+465
-40
lines changed

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,9 @@ class MemoryDepChecker {
144144
// on MinDepDistBytes.
145145
BackwardVectorizable,
146146
// Same, but may prevent store-to-load forwarding.
147-
BackwardVectorizableButPreventsForwarding
147+
BackwardVectorizableButPreventsForwarding,
148+
// Access is to a loop loaded value, but is part of a histogram operation.
149+
Histogram
148150
};
149151

150152
/// String version of the types.
@@ -201,7 +203,8 @@ class MemoryDepChecker {
201203
/// Only checks sets with elements in \p CheckDeps.
202204
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
203205
const DenseMap<Value *, SmallVector<const Value *, 16>>
204-
&UnderlyingObjects);
206+
&UnderlyingObjects,
207+
const SmallPtrSetImpl<const Value *> &HistogramPtrs);
205208

206209
/// No memory dependence was encountered that would inhibit
207210
/// vectorization.
@@ -343,7 +346,8 @@ class MemoryDepChecker {
343346
isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
344347
unsigned BIdx,
345348
const DenseMap<Value *, SmallVector<const Value *, 16>>
346-
&UnderlyingObjects);
349+
&UnderlyingObjects,
350+
const SmallPtrSetImpl<const Value *> &HistogramPtrs);
347351

348352
/// Check whether the data dependence could prevent store-load
349353
/// forwarding.
@@ -384,7 +388,8 @@ class MemoryDepChecker {
384388
const MemAccessInfo &A, Instruction *AInst, const MemAccessInfo &B,
385389
Instruction *BInst,
386390
const DenseMap<Value *, SmallVector<const Value *, 16>>
387-
&UnderlyingObjects);
391+
&UnderlyingObjects,
392+
const SmallPtrSetImpl<const Value *> &HistogramPtrs);
388393
};
389394

390395
class RuntimePointerChecking;
@@ -436,6 +441,15 @@ struct PointerDiffInfo {
436441
NeedsFreeze(NeedsFreeze) {}
437442
};
438443

444+
/// Groups the three instructions that make up one histogram-style operation
/// found in a loop: the load, the update of the loaded value, and the store
/// that writes the result back.
struct HistogramInfo {
445+
Instruction *Load; // The load instruction of the histogram operation.
446+
Instruction *Update; // The instruction that updates the loaded value.
447+
Instruction *Store; // The store instruction of the histogram operation.
448+
449+
// Records the given load/update/store triple; no validation is done here.
HistogramInfo(Instruction *Load, Instruction *Update, Instruction *Store)
450+
: Load(Load), Update(Update), Store(Store) {}
451+
};
452+
439453
/// Holds information about the memory runtime legality checks to verify
440454
/// that a group of pointers do not overlap.
441455
class RuntimePointerChecking {
@@ -655,6 +669,10 @@ class LoopAccessInfo {
655669
unsigned getNumStores() const { return NumStores; }
656670
unsigned getNumLoads() const { return NumLoads;}
657671

672+
/// Returns a read-only reference to the Histograms member, i.e. the
/// histogram operations recorded by this analysis.
const SmallVectorImpl<HistogramInfo> &getHistograms() const {
673+
return Histograms;
674+
}
675+
658676
/// The diagnostics report generated for the analysis. E.g. why we
659677
/// couldn't analyze the loop.
660678
const OptimizationRemarkAnalysis *getReport() const { return Report.get(); }
@@ -768,6 +786,13 @@ class LoopAccessInfo {
768786
/// If an access has a symbolic strides, this maps the pointer value to
769787
/// the stride symbol.
770788
DenseMap<Value *, const SCEV *> SymbolicStrides;
789+
790+
/// Holds the load, update, and store instructions for all histogram-style
791+
/// operations found in the loop.
792+
SmallVector<HistogramInfo, 2> Histograms;
793+
794+
/// Set of pointers recorded for the histogram operations.
795+
SmallPtrSet<const Value *, 2> HistogramPtrs;
771796
};
772797

773798
/// Return the SCEV corresponding to a pointer with the symbolic stride

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,9 @@ class TargetTransformInfo {
989989
/// Return hardware support for population count.
990990
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
991991

992+
/// Returns the cost of generating a vector histogram.
993+
InstructionCost getHistogramCost(Type *Ty) const;
994+
992995
/// Return true if the hardware has a fast square-root instruction.
993996
bool haveFastSqrt(Type *Ty) const;
994997

@@ -1939,6 +1942,7 @@ class TargetTransformInfo::Concept {
19391942
unsigned *Fast) = 0;
19401943
virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
19411944
virtual bool haveFastSqrt(Type *Ty) = 0;
1945+
virtual InstructionCost getHistogramCost(Type *Ty) = 0;
19421946
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) = 0;
19431947
virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
19441948
virtual InstructionCost getFPOpCost(Type *Ty) = 0;
@@ -2505,6 +2509,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
25052509
}
25062510
bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
25072511

2512+
/// Forwards the vector histogram cost query for \p Ty to the wrapped
/// implementation object (Impl) and returns its answer unchanged.
InstructionCost getHistogramCost(Type *Ty) override {
2513+
return Impl.getHistogramCost(Ty);
2514+
}
2515+
25082516
bool isExpensiveToSpeculativelyExecute(const Instruction* I) override {
25092517
return Impl.isExpensiveToSpeculativelyExecute(I);
25102518
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,10 @@ class TargetTransformInfoImplBase {
420420

421421
bool haveFastSqrt(Type *Ty) const { return false; }
422422

423+
/// Base implementation of the histogram cost query: always returns an
/// invalid cost, regardless of \p Ty.
/// NOTE(review): presumably an invalid cost means "no histogram support"
/// to callers -- confirm against the vectorizer's use of this hook.
InstructionCost getHistogramCost(Type *Ty) const {
424+
return InstructionCost::getInvalid();
425+
}
426+
423427
bool isExpensiveToSpeculativelyExecute(const Instruction *I) { return true; }
424428

425429
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return true; }

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
544544
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
545545
}
546546

547+
/// Histogram cost query for this TTI layer: always returns an invalid cost,
/// regardless of \p Ty.
/// NOTE(review): presumably targets with histogram support are expected to
/// provide their own version -- confirm.
InstructionCost getHistogramCost(Type *Ty) {
548+
return InstructionCost::getInvalid();
549+
}
550+
547551
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
548552
return true;
549553
}

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,23 @@ class LoopVectorizationLegality {
387387
unsigned getNumStores() const { return LAI->getNumStores(); }
388388
unsigned getNumLoads() const { return LAI->getNumLoads(); }
389389

390+
/// Returns true if \p I is the Load or the Update instruction of any
/// histogram reported by LAI->getHistograms(); returns false otherwise.
/// The Store instruction of a histogram is not matched by this query.
bool isHistogramLoadOrUpdate(Instruction *I) const {
391+
for (const HistogramInfo &HGram : LAI->getHistograms())
392+
if (HGram.Load == I || HGram.Update == I)
393+
return true;
394+
395+
return false;
396+
}
397+
398+
/// Looks up the histogram whose Store instruction is \p SI.
/// Returns a pointer to the first matching HistogramInfo in
/// LAI->getHistograms(), or std::nullopt if no histogram uses \p SI as its
/// store. The pointer refers to an element of the container returned by
/// LAI->getHistograms(); it is not a copy.
std::optional<const HistogramInfo *>
399+
getHistogramForStore(StoreInst *SI) const {
400+
for (const HistogramInfo &HGram : LAI->getHistograms())
401+
if (HGram.Store == SI)
402+
return &HGram;
403+
404+
return std::nullopt;
405+
}
406+
390407
PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
391408
return &PSE;
392409
}

0 commit comments

Comments
 (0)