Skip to content

Commit f1d84a2

Browse files
committed
LV Changes
1 parent 658e894 commit f1d84a2

File tree

8 files changed

+583
-25
lines changed

8 files changed

+583
-25
lines changed

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,18 @@ class LoopVectorizationRequirements {
224224
Instruction *ExactFPMathInst = nullptr;
225225
};
226226

227+
/// This holds details about a histogram operation -- a load -> update -> store
228+
/// sequence where each lane in a vector might be updating the same element as
229+
/// another lane.
230+
struct HistogramInfo {
231+
LoadInst *Load;
232+
Instruction *Update;
233+
StoreInst *Store;
234+
235+
HistogramInfo(LoadInst *Load, Instruction *Update, StoreInst *Store)
236+
: Load(Load), Update(Update), Store(Store) {}
237+
};
238+
227239
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
228240
/// to what vectorization factor.
229241
/// This class does not look at the profitability of vectorization, only the
@@ -390,6 +402,22 @@ class LoopVectorizationLegality {
390402
unsigned getNumStores() const { return LAI->getNumStores(); }
391403
unsigned getNumLoads() const { return LAI->getNumLoads(); }
392404

405+
/// Returns a HistogramInfo* for the given instruction if it was determined
406+
/// to be part of a load -> update -> store sequence where multiple lanes
407+
/// may be working on the same memory address.
408+
std::optional<const HistogramInfo *> getHistogramInfo(Instruction *I) const {
409+
for (const HistogramInfo &HGram : Histograms)
410+
if (HGram.Load == I || HGram.Update == I || HGram.Store == I)
411+
return &HGram;
412+
413+
return std::nullopt;
414+
}
415+
416+
/// Returns a list of all known histogram operations in the loop.
417+
const SmallVectorImpl<HistogramInfo> &getHistograms() const {
418+
return Histograms;
419+
}
420+
393421
PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
394422
return &PSE;
395423
}
@@ -438,6 +466,11 @@ class LoopVectorizationLegality {
438466
/// Returns true if the loop is vectorizable
439467
bool canVectorizeMemory();
440468

469+
/// If LAA cannot determine whether all dependences are safe, we may be able
470+
/// to further analyse some unknown dependences and if they match a certain
471+
/// pattern (like a histogram) then we may still be able to vectorize.
472+
bool canVectorizeUnknownDependences();
473+
441474
/// Return true if we can vectorize this loop using the IF-conversion
442475
/// transformation.
443476
bool canVectorizeWithIfConvert();
@@ -542,6 +575,11 @@ class LoopVectorizationLegality {
542575
/// conditional assumes.
543576
SmallPtrSet<const Instruction *, 8> MaskedOp;
544577

578+
/// Contains all identified histogram operations, which are sequences of
579+
/// load -> update -> store instructions where multiple lanes in a vector
580+
/// may work on the same memory location.
581+
SmallVector<HistogramInfo, 1> Histograms;
582+
545583
/// BFI and PSI are used to check for profile guided size optimizations.
546584
BlockFrequencyInfo *BFI;
547585
ProfileSummaryInfo *PSI;

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
7878
"Scalable vectorization is available and favored when the "
7979
"cost is inconclusive.")));
8080

81+
// Hidden command-line flag, off by default (cl::init(false)).
// canVectorizeUnknownDependences() returns false immediately unless it is set.
static cl::opt<bool> EnableHistogramVectorization(
82+
"enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
83+
cl::desc("Enables autovectorization of some loops containing histograms"));
84+
8185
/// Maximum vectorization interleave count.
8286
static const unsigned MaxInterleaveFactor = 16;
8387

@@ -1054,6 +1058,110 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
10541058
return true;
10551059
}
10561060

1061+
/// Find Histogram counts that match high-level code in loops:
1062+
/// \code
1063+
/// buckets[indices[i]]+=step;
1064+
/// \endcode
1065+
///
1066+
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
1067+
/// array the computed histogram. It uses a BinOp to sum all counts, storing
1068+
/// them using a loop-variant index Load from the 'indices' input array.
1069+
///
1070+
/// On successful matches it updates the STATISTIC 'HistogramsDetected',
1071+
/// regardless of hardware support. When there is support, it additionally
1072+
/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers
1073+
/// used to update histogram in \p HistogramPtrs.
1074+
1075+
static bool findHistograms(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
1076+
const PredicatedScalarEvolution &PSE,
1077+
SmallVectorImpl<HistogramInfo> &Histograms) {
1078+
1079+
// Store value must come from a Binary Operation.
1080+
Instruction *HPtrInstr = nullptr;
1081+
BinaryOperator *HBinOp = nullptr;
1082+
if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr))))
1083+
return false;
1084+
1085+
// BinOp must be an Add or a Sub modifying the bucket value by a
1086+
// loop invariant amount.
1087+
// FIXME: We assume the loop invariant term is on the RHS.
1088+
// Fine for an immediate/constant, but maybe not a generic value?
1089+
Value *HIncVal = nullptr;
1090+
if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
1091+
!match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))))
1092+
return false;
1093+
1094+
// Make sure the increment value is loop invariant.
1095+
if (!TheLoop->isLoopInvariant(HIncVal))
1096+
return false;
1097+
1098+
// The address to store is calculated through a GEP Instruction.
1099+
// FIXME: Support GEPs with more operands.
1100+
GetElementPtrInst *HPtr = dyn_cast<GetElementPtrInst>(HPtrInstr);
1101+
if (!HPtr || HPtr->getNumOperands() > 2)
1102+
return false;
1103+
1104+
// Check that the index is calculated by loading from another array. Ignore
1105+
// any extensions.
1106+
// FIXME: Support indices from other sources that a linear load from memory?
1107+
Value *HIdx = HPtr->getOperand(1);
1108+
Instruction *IdxInst = nullptr;
1109+
if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Instruction(IdxInst))))
1110+
return false;
1111+
1112+
// Currently restricting this to linear addressing when loading indices.
1113+
LoadInst *VLoad = dyn_cast<LoadInst>(IdxInst);
1114+
Value *VPtrVal;
1115+
if (!VLoad || !match(VLoad, m_Load(m_Value(VPtrVal))))
1116+
return false;
1117+
1118+
if (!isa<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal)))
1119+
return false;
1120+
1121+
// Ensure we'll have the same mask by checking that all parts of the histogram
1122+
// (gather load, update, scatter store) are in the same block.
1123+
LoadInst *IndexedLoad = cast<LoadInst>(HBinOp->getOperand(0));
1124+
BasicBlock *LdBB = IndexedLoad->getParent();
1125+
if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
1126+
return false;
1127+
1128+
LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");
1129+
1130+
// Store the operations that make up the histogram.
1131+
Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
1132+
return true;
1133+
}
1134+
1135+
bool LoopVectorizationLegality::canVectorizeUnknownDependences() {
1136+
// For now, we only support an unknown dependency that calculates a histogram
1137+
if (!EnableHistogramVectorization)
1138+
return false;
1139+
1140+
// FIXME: Support more than one unknown dependence, and check to see if some
1141+
// are handled by runtime checks before looking for histograms.
1142+
LAI = &LAIs.getInfo(*TheLoop);
1143+
const MemoryDepChecker &DepChecker = LAI->getDepChecker();
1144+
const auto *Deps = DepChecker.getDependences();
1145+
if (!Deps || Deps->size() > 1)
1146+
return false;
1147+
1148+
const MemoryDepChecker::Dependence &Dep = (*Deps).front();
1149+
1150+
// We're only interested in unknown dependences.
1151+
if (Dep.Type != MemoryDepChecker::Dependence::Unknown)
1152+
return false;
1153+
1154+
// For now only normal loads and stores are supported.
1155+
LoadInst *LI = dyn_cast<LoadInst>(Dep.getSource(DepChecker));
1156+
StoreInst *SI = dyn_cast<StoreInst>(Dep.getDestination(DepChecker));
1157+
1158+
if (!LI || !SI)
1159+
return false;
1160+
1161+
LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
1162+
return findHistograms(LI, SI, TheLoop, LAI->getPSE(), Histograms);
1163+
}
1164+
10571165
bool LoopVectorizationLegality::canVectorizeMemory() {
10581166
LAI = &LAIs.getInfo(*TheLoop);
10591167
const OptimizationRemarkAnalysis *LAR = LAI->getReport();
@@ -1065,7 +1173,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
10651173
}
10661174

10671175
if (!LAI->canVectorizeMemory())
1068-
return false;
1176+
return canVectorizeUnknownDependences();
10691177

10701178
if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
10711179
reportVectorizationFailure("We don't allow storing to uniform addresses",

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4622,6 +4622,10 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
46224622
if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
46234623
return false;
46244624

4625+
// Loops containing histograms are not currently supported.
4626+
if (!Legal->getHistograms().empty())
4627+
return false;
4628+
46254629
return true;
46264630
}
46274631

@@ -6465,8 +6469,33 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
64656469
// We've proven all lanes safe to speculate, fall through.
64666470
[[fallthrough]];
64676471
case Instruction::Add:
6472+
case Instruction::Sub: {
6473+
auto Info = Legal->getHistogramInfo(I);
6474+
if (Info && VF.isVector()) {
6475+
const HistogramInfo *HGram = Info.value();
6476+
// Assume that a non-constant update value (or a constant != 1) requires
6477+
// a multiply, and add that into the cost.
6478+
InstructionCost MulCost = TTI::TCC_Free;
6479+
ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6480+
if (!RHS || RHS->getZExtValue() != 1)
6481+
MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
6482+
6483+
// Find the cost of the histogram operation itself.
6484+
Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6485+
Type *ScalarTy = I->getType();
6486+
Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6487+
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6488+
Type::getVoidTy(I->getContext()),
6489+
{PtrTy, ScalarTy, MaskTy});
6490+
6491+
// Add the costs together with the add/sub operation.
6492+
return TTI.getIntrinsicInstrCost(
6493+
ICA, TargetTransformInfo::TCK_RecipThroughput) +
6494+
MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
6495+
}
6496+
[[fallthrough]];
6497+
}
64686498
case Instruction::FAdd:
6469-
case Instruction::Sub:
64706499
case Instruction::FSub:
64716500
case Instruction::Mul:
64726501
case Instruction::FMul:
@@ -8173,6 +8202,36 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
81738202
};
81748203
}
81758204

8205+
/// Builds the VPHistogramRecipe for the histogram described by \p HI.
///
/// The recipe receives three operands, in this order: the bucket address
/// (taken from \p Operands[1]), the increment value (operand 1 of the update
/// instruction) and a mask. The recipe carries the debug location of the
/// histogram's store.
VPHistogramRecipe *
VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
                                     ArrayRef<VPValue *> Operands) {
  // FIXME: Support other operations.
  const unsigned UpdateOpc = HI->Update->getOpcode();
  assert((UpdateOpc == Instruction::Add || UpdateOpc == Instruction::Sub) &&
         "Histogram update operation must be an Add or Sub");

  StoreInst *BucketStore = HI->Store;

  // Bucket address.
  VPValue *BucketAddr = Operands[1];
  // Increment value.
  VPValue *IncAmount = getVPValueOrAddLiveIn(HI->Update->getOperand(1), Plan);

  // Predicated execution (tail-folding, conditional execution, or both)
  // needs the block-in mask of the store's block; otherwise an all-true mask
  // is used.
  VPValue *Mask = nullptr;
  if (!Legal->isMaskRequired(BucketStore)) {
    Type *I1Ty = IntegerType::getInt1Ty(HI->Load->getContext());
    Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(I1Ty));
  } else {
    Mask = getBlockInMask(BucketStore->getParent());
  }

  SmallVector<VPValue *, 3> RecipeOps = {BucketAddr, IncAmount, Mask};
  return new VPHistogramRecipe(HI, UpdateOpc,
                               make_range(RecipeOps.begin(), RecipeOps.end()),
                               BucketStore->getDebugLoc());
}
8234+
81768235
void VPRecipeBuilder::fixHeaderPhis() {
81778236
BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
81788237
for (VPHeaderPHIRecipe *R : PhisToFix) {
@@ -8296,6 +8355,10 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
82968355
if (auto *CI = dyn_cast<CallInst>(Instr))
82978356
return tryToWidenCall(CI, Operands, Range);
82988357

8358+
if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8359+
if (auto HistInfo = Legal->getHistogramInfo(SI))
8360+
return tryToWidenHistogram(*HistInfo, Operands);
8361+
82998362
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
83008363
return tryToWidenMemory(Instr, Operands, Range);
83018364

@@ -8563,6 +8626,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
85638626
Operands = {OpRange.begin(), OpRange.end()};
85648627
}
85658628

8629+
// If this is a load instruction or a binop associated with a histogram,
8630+
// leave it until the store instruction to emit a combined intrinsic.
8631+
// Note that if the initial VF is scalar, we need to generate the normal
8632+
// clone recipe for these instructions. A histogram recipe will only be
8633+
// generated when minVF > 1.
8634+
if (Legal->getHistogramInfo(Instr) && !isa<StoreInst>(Instr) &&
8635+
!Range.Start.isScalar())
8636+
continue;
8637+
85668638
// Invariant stores inside loop will be deleted and a single store
85678639
// with the final reduction value will be added to the exit block
85688640
StoreInst *SI;
@@ -9890,6 +9962,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
98909962
InterleaveLoop = false;
98919963
}
98929964

9965+
// If there is a histogram in the loop, do not just interleave without
9966+
// vectorizing. The order of operations will be incorrect without the
9967+
// histogram intrinsics, which are only used for recipes with VF > 1.
9968+
if (!VectorizeLoop && InterleaveLoop && !LVL.getHistograms().empty()) {
9969+
LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
9970+
<< "to histogram operations.\n");
9971+
IntDiagMsg = std::make_pair(
9972+
"HistogramPreventsScalarInterleaving",
9973+
"Unable to interleave without vectorization due to constraints on "
9974+
"the order of histogram operations");
9975+
InterleaveLoop = false;
9976+
}
9977+
98939978
// Override IC if user provided an interleave count.
98949979
IC = UserIC > 0 ? UserIC : IC;
98959980

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ namespace llvm {
2020
class LoopVectorizationLegality;
2121
class LoopVectorizationCostModel;
2222
class TargetLibraryInfo;
23+
struct HistogramInfo;
2324

2425
/// Helper class to create VPRecipes from IR instructions.
2526
class VPRecipeBuilder {
@@ -102,6 +103,13 @@ class VPRecipeBuilder {
102103
VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
103104
VPBasicBlock *VPBB);
104105

106+
/// Makes Histogram count operations safe for vectorization, by emitting a
107+
/// llvm.experimental.vector.histogram.add intrinsic in place of the
108+
/// Load + Add|Sub + Store operations that perform the histogram in the
109+
/// original scalar loop.
110+
VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
111+
ArrayRef<VPValue *> Operands);
112+
105113
public:
106114
VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
107115
LoopVectorizationLegality *Legal,

0 commit comments

Comments
 (0)