Skip to content

Commit 131acee

Browse files
committed
Addressing review comments:
* Improve test ir names * Explain current limitation on indices * Check index address is not loop-invariant (with test) * Check for null pointer on dependences (with test) * Remove HistogramInfo from recipe, remove legality header from VPlanRecipes.cpp * Implement overridden computeCost for VPHistogramRecipe * Separated VPlan checks from main test file, used regex matching * Added a 4-op GEP test featuring a struct * Added 64b histogram test without index extension * Added comment to target-generic test file explaining it doesn't currently vectorize
1 parent 7d4e387 commit 131acee

11 files changed

+614
-324
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,12 +1110,17 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
11101110

11111111
// Check that the index is calculated by loading from another array. Ignore
11121112
// any extensions.
1113-
// FIXME: Support indices from other sources that a linear load from memory?
1113+
// FIXME: Support indices from other sources than a linear load from memory?
1114+
// We're currently trying to match an operation looping over an array
1115+
// of indices, but there could be additional levels of indirection
1116+
// in place, or possibly some additional calculation to form the index
1117+
// from the loaded data.
11141118
Value *VPtrVal;
11151119
if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Load(m_Value(VPtrVal)))))
11161120
return false;
11171121

1118-
if (!isa<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal)))
1122+
if (!isa<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal)) ||
1123+
TheLoop->isLoopInvariant(VPtrVal))
11191124
return false;
11201125

11211126
// Ensure we'll have the same mask by checking that all parts of the histogram
@@ -1143,6 +1148,11 @@ bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
11431148
const MemoryDepChecker::Dependence *IUDep = nullptr;
11441149
const MemoryDepChecker &DepChecker = LAI->getDepChecker();
11451150
const auto *Deps = DepChecker.getDependences();
1151+
// If there were too many dependences, LAA abandons recording them. We can't
1152+
// proceed safely if we don't know what the dependences are.
1153+
if (!Deps)
1154+
return false;
1155+
11461156
for (const MemoryDepChecker::Dependence &Dep : *Deps) {
11471157
// Ignore dependencies that are either known to be safe or can be
11481158
// checked at runtime.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8477,7 +8477,7 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
84778477
if (Legal->isMaskRequired(HI->Store))
84788478
HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
84798479

8480-
return new VPHistogramRecipe(*HI, Opcode,
8480+
return new VPHistogramRecipe(Opcode,
84818481
make_range(HGramOps.begin(), HGramOps.end()),
84828482
HI->Store->getDebugLoc());
84838483
}

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,15 +1698,13 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
16981698
/// of intrinsics. The only update operations currently supported are
16991699
/// 'add' and 'sub' where the other term is loop-invariant.
17001700
class VPHistogramRecipe : public VPRecipeBase {
1701-
const HistogramInfo &Info;
17021701
unsigned Opcode;
17031702

17041703
public:
17051704
template <typename IterT>
1706-
VPHistogramRecipe(const HistogramInfo &HI, unsigned Opcode,
1707-
iterator_range<IterT> Operands, DebugLoc DL = {})
1708-
: VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Info(HI),
1709-
Opcode(Opcode) {}
1705+
VPHistogramRecipe(unsigned Opcode, iterator_range<IterT> Operands,
1706+
DebugLoc DL = {})
1707+
: VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {}
17101708

17111709
~VPHistogramRecipe() override = default;
17121710

@@ -1719,9 +1717,11 @@ class VPHistogramRecipe : public VPRecipeBase {
17191717
/// Produce a vectorized histogram operation.
17201718
void execute(VPTransformState &State) override;
17211719

1722-
unsigned getOpcode() const { return Opcode; }
1720+
/// Return the cost of this VPHistogramRecipe.
1721+
InstructionCost computeCost(ElementCount VF,
1722+
VPCostContext &Ctx) const override;
17231723

1724-
const HistogramInfo &getHistogramInfo() const { return Info; }
1724+
unsigned getOpcode() const { return Opcode; }
17251725

17261726
/// Return the mask operand if one was provided, or a null pointer if all
17271727
/// lanes should be executed unconditionally.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
3434
#include "llvm/Transforms/Utils/LoopUtils.h"
3535
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
36-
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
3736
#include <cassert>
3837

3938
using namespace llvm;
@@ -286,10 +285,6 @@ static Instruction *getInstructionForCost(const VPRecipeBase *R) {
286285
// the legacy model.
287286
if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
288287
return &WidenMem->getIngredient();
289-
// FIXME: Override the cost method properly to take gather/scatter cost
290-
// into account, instead of just the intrinsic via the legacy model.
291-
if (auto *HG = dyn_cast<VPHistogramRecipe>(R))
292-
return HG->getHistogramInfo().Update;
293288
return nullptr;
294289
}
295290

@@ -1087,6 +1082,39 @@ void VPHistogramRecipe::execute(VPTransformState &State) {
10871082
}
10881083
}
10891084

1085+
InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1086+
VPCostContext &Ctx) const {
1087+
// FIXME: Take the gather and scatter into account as well. For now we're
1088+
// generating the same cost as the fallback path, but we'll likely
1089+
// need to create a new TTI method for determining the cost, including
1090+
// whether we can use base + vec-of-smaller-indices or just
1091+
// vec-of-pointers.
1092+
assert(VF.isVector() && "Invalid VF for histogram cost");
1093+
Value *Address = getOperand(0)->getUnderlyingValue();
1094+
Value *IncAmt = getOperand(1)->getUnderlyingValue();
1095+
Type *IncTy = IncAmt->getType();
1096+
VectorType *VTy = VectorType::get(IncTy, VF);
1097+
1098+
// Assume that a non-constant update value (or a constant != 1) requires
1099+
// a multiply, and add that into the cost.
1100+
InstructionCost MulCost = TTI::TCC_Free;
1101+
ConstantInt *RHS = dyn_cast<ConstantInt>(IncAmt);
1102+
if (!RHS || RHS->getZExtValue() != 1)
1103+
MulCost = Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy);
1104+
1105+
// Find the cost of the histogram operation itself.
1106+
Type *PtrTy = VectorType::get(Address->getType(), VF);
1107+
Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1108+
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1109+
Type::getVoidTy(Ctx.LLVMCtx),
1110+
{PtrTy, IncTy, MaskTy});
1111+
1112+
// Add the costs together with the add/sub operation.
1113+
return Ctx.TTI.getIntrinsicInstrCost(
1114+
ICA, TargetTransformInfo::TCK_RecipThroughput) +
1115+
MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
1116+
}
1117+
10901118
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
10911119
void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
10921120
VPSlotTracker &SlotTracker) const {

llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,13 @@ entry:
5959

6060
for.body:
6161
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
62-
%arrayidx = getelementptr inbounds i32, ptr %indices, i64 %iv
63-
%0 = load i32, ptr %arrayidx, align 4
64-
%idxprom1 = zext i32 %0 to i64
65-
%arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
66-
%1 = load i32, ptr %arrayidx2, align 4
67-
%inc = add nsw i32 %1, 1
68-
store i32 %inc, ptr %arrayidx2, align 4
62+
%gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
63+
%l.idx = load i32, ptr %gep.indices, align 4
64+
%idxprom1 = zext i32 %l.idx to i64
65+
%gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
66+
%l.bucket = load i32, ptr %gep.bucket, align 4
67+
%inc = add nsw i32 %l.bucket, 1
68+
store i32 %inc, ptr %gep.bucket, align 4
6969
%iv.next = add nuw nsw i64 %iv, 1
7070
%exitcond = icmp eq i64 %iv.next, %N
7171
br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !0

llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-no-scalar-interleave.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,13 @@ entry:
3232

3333
for.body:
3434
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
35-
%arrayidx = getelementptr inbounds i32, ptr %indices, i64 %iv
36-
%0 = load i32, ptr %arrayidx, align 4
37-
%idxprom1 = zext i32 %0 to i64
38-
%arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
39-
%1 = load i32, ptr %arrayidx2, align 4
40-
%inc = add nsw i32 %1, 1
41-
store i32 %inc, ptr %arrayidx2, align 4
35+
%gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
36+
%l.idx = load i32, ptr %gep.indices, align 4
37+
%idxprom1 = zext i32 %l.idx to i64
38+
%gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
39+
%l.bucket = load i32, ptr %gep.bucket, align 4
40+
%inc = add nsw i32 %l.bucket, 1
41+
store i32 %inc, ptr %gep.bucket, align 4
4242
%iv.next = add nuw nsw i64 %iv, 1
4343
%exitcond = icmp eq i64 %iv.next, %N
4444
br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !0
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
; RUN: opt < %s -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
2+
; REQUIRES: asserts
3+
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
;; Make sure we don't detect a histogram operation if the index address is
7+
;; loop invariant.
8+
; CHECK: LV: Checking for a histogram on: store i32 %inc, ptr %gep.bucket, align 4
9+
; CHECK-NEXT: LV: Can't vectorize due to memory conflicts
10+
; CHECK-NEXT: LV: Not vectorizing: Cannot prove legality.
11+
12+
define void @outer_loop_scevaddrec(ptr noalias %buckets, ptr readonly %indices, i64 %N, i64 %M) #0 {
13+
entry:
14+
br label %outer.header
15+
16+
outer.header:
17+
%outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
18+
%gep.indices = getelementptr inbounds i32, ptr %indices, i64 %outer.iv
19+
br label %inner.body
20+
21+
inner.body:
22+
%iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner.body ]
23+
%l.idx = load i32, ptr %gep.indices, align 4
24+
%idxprom1 = zext i32 %l.idx to i64
25+
%gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
26+
%l.bucket = load i32, ptr %gep.bucket, align 4
27+
%inc = add nsw i32 %l.bucket, 1
28+
store i32 %inc, ptr %gep.bucket, align 4
29+
%iv.next = add nuw nsw i64 %iv, 1
30+
%exitcond = icmp eq i64 %iv.next, %N
31+
br i1 %exitcond, label %outer.latch, label %inner.body
32+
33+
outer.latch:
34+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
35+
%outer.exitcond = icmp eq i64 %outer.iv.next, %M
36+
br i1 %outer.exitcond, label %outer.exit, label %outer.header
37+
38+
outer.exit:
39+
ret void
40+
}
41+
42+
attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }

0 commit comments

Comments
 (0)