Skip to content

Commit 20fe328

Browse files
committed
Add an all-in-one histogram intrinsic, along with lowering for AArch64
1 parent df311a2 commit 20fe328

File tree

11 files changed

+251
-0
lines changed

11 files changed

+251
-0
lines changed

llvm/include/llvm/CodeGen/ISDOpcodes.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,6 +1402,11 @@ enum NodeType {
14021402
// which is later translated to an implicit use in the MIR.
14031403
CONVERGENCECTRL_GLUE,
14041404

1405+
// Experimental vector histogram intrinsic
1406+
// Operands: Input Chain, Inc, Mask, Base, Index, Scale, ID
1407+
// Output: Output Chain
1408+
EXPERIMENTAL_HISTOGRAM,
1409+
14051410
/// BUILTIN_OP_END - This must be the last enum value in this list.
14061411
/// The target-specific pre-isel opcode values start here.
14071412
BUILTIN_OP_END

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1526,6 +1526,9 @@ class SelectionDAG {
15261526
ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
15271527
ISD::MemIndexType IndexType,
15281528
bool IsTruncating = false);
1529+
SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl,
1530+
ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
1531+
ISD::MemIndexType IndexType);
15291532

15301533
SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT,
15311534
MachineMemOperand *MMO);

llvm/include/llvm/CodeGen/SelectionDAGNodes.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,7 @@ BEGIN_TWO_BYTE_PACK()
542542
friend class MaskedLoadStoreSDNode;
543543
friend class MaskedGatherScatterSDNode;
544544
friend class VPGatherScatterSDNode;
545+
friend class MaskedHistogramSDNode;
545546

546547
uint16_t : NumMemSDNodeBits;
547548

@@ -564,6 +565,7 @@ BEGIN_TWO_BYTE_PACK()
564565
friend class MaskedLoadSDNode;
565566
friend class MaskedGatherSDNode;
566567
friend class VPGatherSDNode;
568+
friend class MaskedHistogramSDNode;
567569

568570
uint16_t : NumLSBaseSDNodeBits;
569571

@@ -1420,6 +1422,7 @@ class MemSDNode : public SDNode {
14201422
return getOperand(2);
14211423
case ISD::MGATHER:
14221424
case ISD::MSCATTER:
1425+
case ISD::EXPERIMENTAL_HISTOGRAM:
14231426
return getOperand(3);
14241427
default:
14251428
return getOperand(1);
@@ -1468,6 +1471,7 @@ class MemSDNode : public SDNode {
14681471
case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
14691472
case ISD::GET_FPENV_MEM:
14701473
case ISD::SET_FPENV_MEM:
1474+
case ISD::EXPERIMENTAL_HISTOGRAM:
14711475
return true;
14721476
default:
14731477
return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
@@ -2953,6 +2957,33 @@ class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
29532957
}
29542958
};
29552959

2960+
class MaskedHistogramSDNode : public MemSDNode {
2961+
public:
2962+
friend class SelectionDAG;
2963+
2964+
MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
2965+
EVT MemVT, MachineMemOperand *MMO,
2966+
ISD::MemIndexType IndexType)
2967+
: MemSDNode(ISD::EXPERIMENTAL_HISTOGRAM, Order, DL, VTs, MemVT, MMO) {
2968+
LSBaseSDNodeBits.AddressingMode = IndexType;
2969+
}
2970+
2971+
ISD::MemIndexType getIndexType() const {
2972+
return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
2973+
}
2974+
2975+
const SDValue &getBasePtr() const { return getOperand(3); }
2976+
const SDValue &getIndex() const { return getOperand(4); }
2977+
const SDValue &getMask() const { return getOperand(2); }
2978+
const SDValue &getScale() const { return getOperand(5); }
2979+
const SDValue &getInc() const { return getOperand(1); }
2980+
const SDValue &getIntID() const { return getOperand(6); }
2981+
2982+
static bool classof(const SDNode *N) {
2983+
return N->getOpcode() == ISD::EXPERIMENTAL_HISTOGRAM;
2984+
}
2985+
};
2986+
29562987
class FPStateAccessSDNode : public MemSDNode {
29572988
public:
29582989
friend class SelectionDAG;

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1856,6 +1856,13 @@ def int_experimental_vp_strided_load : DefaultAttrsIntrinsic<[llvm_anyvector_ty
18561856
llvm_i32_ty],
18571857
[ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
18581858

1859+
// Experimental histogram
1860+
def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
1861+
[ llvm_anyvector_ty, // Vector of pointers
1862+
llvm_anyint_ty, // Increment
1863+
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
1864+
[]>;
1865+
18591866
// Operators
18601867
let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
18611868
// Integer arithmetic

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9614,6 +9614,44 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl,
96149614
return V;
96159615
}
96169616

9617+
/// Get (or create) an ISD::EXPERIMENTAL_HISTOGRAM node.
///
/// If an identical node is already present in the CSE map it is reused, after
/// refining its alignment with \p MMO; otherwise a new MaskedHistogramSDNode
/// is created and inserted.
///
/// \param VTs       Result value types of the node.
/// \param MemVT     Memory value type recorded on the node.
/// \param dl        Debug location / IR order for the node.
/// \param Ops       Exactly seven operands, in the order
///                  (Chain, Inc, Mask, Base, Index, Scale, IntrinsicID).
/// \param MMO       Memory operand describing the access.
/// \param IndexType How the index operand is to be interpreted.
/// \returns Result 0 of the (possibly pre-existing) histogram node.
SDValue SelectionDAG::getMaskedHistogram(SDVTList VTs, EVT MemVT,
                                         const SDLoc &dl, ArrayRef<SDValue> Ops,
                                         MachineMemOperand *MMO,
                                         ISD::MemIndexType IndexType) {
  assert(Ops.size() == 7 && "Incompatible number of operands");

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, ISD::EXPERIMENTAL_HISTOGRAM, VTs, Ops);
  ID.AddInteger(MemVT.getRawBits());
  ID.AddInteger(getSyntheticNodeSubclassData<MaskedHistogramSDNode>(
      dl.getIROrder(), VTs, MemVT, MMO, IndexType));
  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
  ID.AddInteger(MMO->getFlags());
  void *IP = nullptr;
  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
    // The node found under this ID has opcode EXPERIMENTAL_HISTOGRAM, so it
    // must be cast to MaskedHistogramSDNode. Casting to MaskedGatherSDNode
    // would be an invalid cast for a histogram node.
    cast<MaskedHistogramSDNode>(E)->refineAlignment(MMO);
    return SDValue(E, 0);
  }

  auto *N = newSDNode<MaskedHistogramSDNode>(dl.getIROrder(), dl.getDebugLoc(),
                                             VTs, MemVT, MMO, IndexType);
  createOperands(N, Ops);

  // Sanity-check the operands now that they are attached to the node.
  assert(N->getMask().getValueType().getVectorElementCount() ==
             N->getIndex().getValueType().getVectorElementCount() &&
         "Vector width mismatch between mask and data");
  assert(isa<ConstantSDNode>(N->getScale()) &&
         N->getScale()->getAsAPIntVal().isPowerOf2() &&
         "Scale should be a constant power of 2");
  assert(N->getInc().getValueType().isInteger() && "Non integer update value");

  CSEMap.InsertNode(N, IP);
  InsertNode(N);
  SDValue V(N, 0);
  NewSDValueDbgMsg(V, "Creating new node: ", this);
  return V;
}
9654+
96179655
SDValue SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr,
96189656
EVT MemVT, MachineMemOperand *MMO) {
96199657
assert(Chain.getValueType() == MVT::Other && "Invalid chain type");

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6281,6 +6281,64 @@ void SelectionDAGBuilder::visitConvergenceControl(const CallInst &I,
62816281
}
62826282
}
62836283

6284+
void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
6285+
unsigned IntrinsicID) {
6286+
// For now, we're only lowering an 'add' histogram.
6287+
// We can add others later, e.g. saturating adds, min/max.
6288+
assert(IntrinsicID == Intrinsic::experimental_vector_histogram_add &&
6289+
"Tried to lower unsupported histogram type");
6290+
SDLoc sdl = getCurSDLoc();
6291+
Value *Ptr = I.getOperand(0);
6292+
SDValue Inc = getValue(I.getOperand(1));
6293+
SDValue Mask = getValue(I.getOperand(2));
6294+
6295+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6296+
DataLayout TargetDL = DAG.getDataLayout();
6297+
EVT VT = Inc.getValueType();
6298+
Align Alignment = DAG.getEVTAlign(VT);
6299+
6300+
const MDNode *Ranges = getRangeMetadata(I);
6301+
6302+
SDValue Root = DAG.getRoot();
6303+
SDValue Base;
6304+
SDValue Index;
6305+
ISD::MemIndexType IndexType;
6306+
SDValue Scale;
6307+
bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
6308+
I.getParent(), VT.getScalarStoreSize());
6309+
6310+
unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
6311+
6312+
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
6313+
MachinePointerInfo(AS),
6314+
MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
6315+
MemoryLocation::UnknownSize, Alignment, I.getAAMetadata(), Ranges);
6316+
6317+
if (!UniformBase) {
6318+
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
6319+
Index = getValue(Ptr);
6320+
IndexType = ISD::SIGNED_SCALED;
6321+
Scale =
6322+
DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
6323+
}
6324+
6325+
EVT IdxVT = Index.getValueType();
6326+
EVT EltTy = IdxVT.getVectorElementType();
6327+
if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
6328+
EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
6329+
Index = DAG.getNode(ISD::SIGN_EXTEND, sdl, NewIdxVT, Index);
6330+
}
6331+
6332+
SDValue ID = DAG.getTargetConstant(IntrinsicID, sdl, MVT::i32);
6333+
6334+
SDValue Ops[] = {Root, Inc, Mask, Base, Index, Scale, ID};
6335+
SDValue Histogram = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), VT, sdl,
6336+
Ops, MMO, IndexType);
6337+
6338+
setValue(&I, Histogram);
6339+
DAG.setRoot(Histogram);
6340+
}
6341+
62846342
/// Lower the call to the specified intrinsic function.
62856343
void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
62866344
unsigned Intrinsic) {
@@ -7949,6 +8007,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
79498007
case Intrinsic::experimental_convergence_entry:
79508008
case Intrinsic::experimental_convergence_loop:
79518009
visitConvergenceControl(I, Intrinsic);
8010+
return;
8011+
case Intrinsic::experimental_vector_histogram_add: {
8012+
visitVectorHistogram(I, Intrinsic);
8013+
return;
8014+
}
79528015
}
79538016
}
79548017

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,7 @@ class SelectionDAGBuilder {
624624
void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
625625
void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
626626
void visitConvergenceControl(const CallInst &I, unsigned Intrinsic);
627+
void visitVectorHistogram(const CallInst &I, unsigned IntrinsicID);
627628
void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
628629
const SmallVectorImpl<SDValue> &OpValues);
629630
void visitVPStore(const VPIntrinsic &VPIntrin,

llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
529529
case ISD::PATCHPOINT:
530530
return "patchpoint";
531531

532+
case ISD::EXPERIMENTAL_HISTOGRAM:
533+
return "histogram";
534+
532535
// Vector Predication
533536
#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
534537
case ISD::SDID: \

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1606,6 +1606,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
16061606
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
16071607
}
16081608

1609+
// Histcnt is SVE2 only
1610+
if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable())
1611+
setOperationAction(ISD::EXPERIMENTAL_HISTOGRAM, MVT::Other, Custom);
1612+
16091613
// NOTE: Currently this has to happen after computeRegisterProperties rather
16101614
// than the preferred option of combining it with the addRegisterClass call.
16111615
if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -6730,6 +6734,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
67306734
return LowerFunnelShift(Op, DAG);
67316735
case ISD::FLDEXP:
67326736
return LowerFLDEXP(Op, DAG);
6737+
case ISD::EXPERIMENTAL_HISTOGRAM:
6738+
return LowerVECTOR_HISTOGRAM(Op, DAG);
67336739
}
67346740
}
67356741

@@ -27249,6 +27255,62 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
2724927255
return DAG.getMergeValues({Lo, Hi}, DL);
2725027256
}
2725127257

27258+
/// Custom-lower ISD::EXPERIMENTAL_HISTOGRAM for SVE2.
///
/// The node is expanded into a masked gather of the current bucket values, an
/// aarch64_sve_histcnt of the index vector against itself, a multiply of that
/// count by the splatted increment, an add onto the gathered values, and a
/// masked scatter of the result back to the same addresses.
///
/// \returns The chain result of the final masked scatter.
SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
                                                     SelectionDAG &DAG) const {
  // FIXME: Maybe share some code with LowerMGather/Scatter?
  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
  SDLoc DL(HG);
  SDValue Chain = HG->getOperand(0);
  SDValue Inc = HG->getInc();
  SDValue Mask = HG->getMask();
  SDValue Ptr = HG->getBasePtr();
  SDValue Index = HG->getIndex();
  SDValue Scale = HG->getScale();
  SDValue IntID = HG->getIntID();

  // The Intrinsic ID determines the type of update operation.
  // CID is only read by the assert below, so mark it to avoid an
  // unused-variable warning in builds without assertions.
  [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
  // Right now, we only support 'add' as an update.
  assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
         "Unexpected histogram update operation");

  EVT IncVT = Inc.getValueType();
  EVT IndexVT = Index.getValueType();
  EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
                               IndexVT.getVectorElementCount());
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
  SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
  SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};

  // Create a load-only MMO for the gather instead of modifying the histogram
  // node's MMO in place. MachineMemOperand::setFlags() only ORs flags in, so
  // calling it on the original load|store MMO would leave MOStore set.
  MachineMemOperand *HMMO = HG->getMemOperand();
  MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
      HMMO->getPointerInfo(), MachineMemOperand::MOLoad, HMMO->getSize(),
      HMMO->getAlign(), HMMO->getAAInfo());
  ISD::MemIndexType IndexType = HG->getIndexType();
  SDValue Gather =
      DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops,
                          GMMO, IndexType, ISD::NON_EXTLOAD);

  SDValue GChain = Gather.getValue(1);

  // Perform the histcnt, multiply by inc, add to bucket data.
  SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
  SDValue HistCnt =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
  SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
  SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);

  // Create a new store-only MMO for the scatter.
  MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
      HMMO->getPointerInfo(), MachineMemOperand::MOStore, HMMO->getSize(),
      HMMO->getAlign(), HMMO->getAAInfo());

  // The scatter is chained after the gather so the read of the buckets is
  // ordered before the write-back.
  SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
  SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
                                         ScatterOps, SMMO, IndexType, false);
  return Scatter;
}
27313+
2725227314
SDValue
2725327315
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
2725427316
SelectionDAG &DAG) const {

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,6 +1149,7 @@ class AArch64TargetLowering : public TargetLowering {
11491149
SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
11501150
SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
11511151
SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
1152+
SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
11521153
SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
11531154
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
11541155
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
3+
4+
define void @histogram_i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0 {
5+
; CHECK-LABEL: histogram_i64:
6+
; CHECK: // %bb.0:
7+
; CHECK-NEXT: histcnt z1.d, p0/z, z0.d, z0.d
8+
; CHECK-NEXT: mov z3.d, x0
9+
; CHECK-NEXT: ld1d { z2.d }, p0/z, [z0.d]
10+
; CHECK-NEXT: ptrue p1.d
11+
; CHECK-NEXT: mad z1.d, p1/m, z3.d, z2.d
12+
; CHECK-NEXT: st1d { z1.d }, p0, [z0.d]
13+
; CHECK-NEXT: ret
14+
call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask)
15+
ret void
16+
}
17+
18+
;; FIXME: We may need some DAG combines here. We're multiplying the output of the histcnt
19+
;; by 1, so we should be able to remove that and directly add the histcnt to the
20+
;; current bucket data.
21+
define void @histogram_i32_literal(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
22+
; CHECK-LABEL: histogram_i32_literal:
23+
; CHECK: // %bb.0:
24+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
25+
; CHECK-NEXT: mov z3.s, #1 // =0x1
26+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
27+
; CHECK-NEXT: ptrue p1.s
28+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
29+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
30+
; CHECK-NEXT: ret
31+
32+
%buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %indices
33+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
34+
ret void
35+
}
36+
37+
attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }

0 commit comments

Comments
 (0)