Commit 756166e

[AMDGPU] Improve detection of non-null addrspacecast operands (#82311)
Use IR analysis to infer when an addrspacecast operand is nonnull, then lower it to an intrinsic that the DAG can use to skip the null check.

I did this using an intrinsic as it's non-intrusive. An alternative would have been to allow something like `!nonnull` on `addrspacecast`, then lower that to a custom opcode (or add an operand to the addrspacecast MIR/DAG opcodes), but that is a lot of boilerplate for just one target's use case, IMO. I'm hoping that when we switch to GISel we can move all this logic to the MIR level without losing info, but currently the DAG doesn't see enough, so we need to act in CGP.

Fixes: SWDEV-316445
1 parent f28c4b4

8 files changed: +498 -17 lines
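As a quick sketch of the transform this commit enables (hand-written LLVM IR, not taken from the commit's tests; function and argument names are illustrative):

; Input: the cast source is a nonnull argument, so the null check is provably dead.
define ptr @local_to_flat(ptr addrspace(3) nonnull %p) {
  %x = addrspacecast ptr addrspace(3) %p to ptr
  ret ptr %x
}

; After amdgpu-codegenprepare (sketch): the cast becomes the new intrinsic,
; which the DAG/GISel lowerings turn into a plain aperture merge with no
; null-pointer select.
define ptr @local_to_flat(ptr addrspace(3) nonnull %p) {
  %x = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %p)
  ret ptr %x
}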

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 0 deletions
@@ -3196,4 +3196,11 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
   [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+/// Emit an addrspacecast without null pointer checking.
+/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
+def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
+  [llvm_anyptr_ty], [llvm_anyptr_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
 }
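Because both the result and the source are `llvm_anyptr_ty`, the intrinsic is overloaded on the two pointer types. For a local-to-flat cast, the auto-mangled declaration would look like the following (a sketch; the p0/p3 pair is just one instantiation):

declare ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3))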

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 73 additions & 0 deletions
@@ -99,6 +99,7 @@ class AMDGPUCodeGenPrepareImpl
     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
 public:
   const GCNSubtarget *ST = nullptr;
+  const AMDGPUTargetMachine *TM = nullptr;
   const TargetLibraryInfo *TLInfo = nullptr;
   AssumptionCache *AC = nullptr;
   DominatorTree *DT = nullptr;
@@ -310,6 +311,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
   bool visitPHINode(PHINode &I);
+  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
@@ -2013,6 +2015,75 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
   return true;
 }

+/// \param V Value to check
+/// \param DL DataLayout
+/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
+/// \param AS Target Address Space
+/// \return true if \p V cannot be the null value of \p AS, false otherwise.
+static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
+                                const AMDGPUTargetMachine &TM, unsigned AS) {
+  // Pointer cannot be null if it's a block address, GV or alloca.
+  // NOTE: We don't support extern_weak, but if we did, we'd need to check for
+  // it as the symbol could be null in such cases.
+  if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
+    return true;
+
+  // Check nonnull arguments.
+  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
+    return true;
+
+  // TODO: Calls that return nonnull?
+
+  // For all other things, use KnownBits.
+  // We either use 0 or all bits set to indicate null, so check whether the
+  // value can be zero or all ones.
+  //
+  // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
+  // address spaces have non-zero null values.
+  auto SrcPtrKB = computeKnownBits(V, DL).trunc(DL.getPointerSizeInBits(AS));
+  const auto NullVal = TM.getNullPointerValue(AS);
+  assert((NullVal == 0 || NullVal == -1) &&
+         "don't know how to check for this null value!");
+  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
+}
+
+bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
+  // Intrinsic doesn't support vectors, also it seems that it's often difficult
+  // to prove that a vector cannot have any nulls in it so it's unclear if it's
+  // worth supporting.
+  if (I.getType()->isVectorTy())
+    return false;
+
+  // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
+  // This is only worthwhile for casts from/to priv/local to flat.
+  const unsigned SrcAS = I.getSrcAddressSpace();
+  const unsigned DstAS = I.getDestAddressSpace();
+
+  bool CanLower = false;
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+    CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
+                DstAS == AMDGPUAS::PRIVATE_ADDRESS);
+  else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
+    CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+                SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
+  if (!CanLower)
+    return false;
+
+  SmallVector<const Value *, 4> WorkList;
+  getUnderlyingObjects(I.getOperand(0), WorkList);
+  if (!all_of(WorkList, [&](const Value *V) {
+        return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
+      }))
+    return false;
+
+  IRBuilder<> B(&I);
+  auto *Intrin = B.CreateIntrinsic(
+      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
+  I.replaceAllUsesWith(Intrin);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
@@ -2196,6 +2267,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
     return false;

   const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
+  Impl.TM = &TM;
   Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -2214,6 +2286,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
   AMDGPUCodeGenPrepareImpl Impl;
   Impl.Mod = F.getParent();
   Impl.DL = &Impl.Mod->getDataLayout();
+  Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
   Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
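The `getUnderlyingObjects` walk means the rewrite only fires when every underlying object is provably non-null (a block address, global, alloca, or `nonnull` argument). Two hand-written IR cases under that assumption (not from the commit's tests):

; Qualifies: the only underlying object is an alloca, which is never null,
; so CGP rewrites the cast to the intrinsic.
define ptr @priv_to_flat() {
  %a = alloca i32, addrspace(5)
  %x = addrspacecast ptr addrspace(5) %a to ptr
  ret ptr %x
}

; Does not qualify: a plain argument may be null, so the addrspacecast
; (and the null check it implies) is left untouched.
define ptr @maybe_null_to_flat(ptr addrspace(3) %p) {
  %x = addrspacecast ptr addrspace(3) %p to ptr
  ret ptr %x
}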

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 16 additions & 4 deletions
@@ -2247,10 +2247,16 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
     MachineIRBuilder &B) const {
   MachineFunction &MF = B.getMF();

+  // MI can either be a G_ADDRSPACE_CAST or a
+  // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
+  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
+         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
+                                     Intrinsic::amdgcn_addrspacecast_nonnull));
+
   const LLT S32 = LLT::scalar(32);
   Register Dst = MI.getOperand(0).getReg();
-  Register Src = MI.getOperand(1).getReg();
-
+  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
+                                     : MI.getOperand(1).getReg();
   LLT DstTy = MRI.getType(Dst);
   LLT SrcTy = MRI.getType(Src);
   unsigned DestAS = DstTy.getAddressSpace();
@@ -2271,7 +2277,9 @@
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
-    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
+    // G_ADDRSPACE_CAST we need to guess.
+    if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
       // Extract low 32-bits of the pointer.
       B.buildExtract(Dst, Src, 0);
       MI.eraseFromParent();
@@ -2308,7 +2316,9 @@
     // avoid the ptrtoint?
     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

-    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
+    // G_ADDRSPACE_CAST we need to guess.
+    if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
       B.buildCopy(Dst, BuildPtr);
       MI.eraseFromParent();
       return true;
@@ -7020,6 +7030,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,

     return false;
   }
+  case Intrinsic::amdgcn_addrspacecast_nonnull:
+    return legalizeAddrSpaceCast(MI, MRI, B);
   case Intrinsic::amdgcn_make_buffer_rsrc:
     return legalizePointerAsRsrcIntrin(MI, MRI, B);
   case Intrinsic::amdgcn_kernarg_segment_ptr:

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 44 additions & 13 deletions
@@ -1415,6 +1415,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   }
 }

+void SITargetLowering::CollectTargetIntrinsicOperands(
+    const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
+  switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
+  case Intrinsic::amdgcn_addrspacecast_nonnull: {
+    // The DAG's ValueType loses the addrspaces.
+    // Add them as 2 extra Constant operands "from" and "to".
+    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
+    unsigned DstAS = I.getType()->getPointerAddressSpace();
+    Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
+    Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
+    break;
+  }
+  default:
+    break;
+  }
+}
+
 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                             SmallVectorImpl<Value*> &Ops,
                                             Type *&AccessTy) const {
@@ -6635,24 +6652,36 @@ static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDLoc SL(Op);
-  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
-
-  SDValue Src = ASC->getOperand(0);
-  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
-  unsigned SrcAS = ASC->getSrcAddressSpace();

   const AMDGPUTargetMachine &TM =
     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

+  unsigned DestAS, SrcAS;
+  SDValue Src;
+  bool IsNonNull = false;
+  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
+    SrcAS = ASC->getSrcAddressSpace();
+    Src = ASC->getOperand(0);
+    DestAS = ASC->getDestAddressSpace();
+  } else {
+    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+           Op.getConstantOperandVal(0) ==
+               Intrinsic::amdgcn_addrspacecast_nonnull);
+    Src = Op->getOperand(1);
+    SrcAS = Op->getConstantOperandVal(2);
+    DestAS = Op->getConstantOperandVal(3);
+    IsNonNull = true;
+  }
+
+  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
   // flat -> local/private
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
-    unsigned DestAS = ASC->getDestAddressSpace();
-
     if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
         DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

-      if (isKnownNonNull(Src, DAG, TM, SrcAS))
+      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
         return Ptr;

       unsigned NullVal = TM.getNullPointerValue(DestAS);
@@ -6665,16 +6694,16 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
   }

   // local/private -> flat
-  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {

-      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
+      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
       SDValue CvtPtr =
           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
       CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

-      if (isKnownNonNull(Src, DAG, TM, SrcAS))
+      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
         return CvtPtr;

       unsigned NullVal = TM.getNullPointerValue(SrcAS);
@@ -6697,7 +6726,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
   }

-  if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
       Src.getValueType() == MVT::i64)
     return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

@@ -6708,7 +6737,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
       MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
   DAG.getContext()->diagnose(InvalidAddrSpaceCast);

-  return DAG.getUNDEF(ASC->getValueType(0));
+  return DAG.getUNDEF(Op->getValueType(0));
 }

 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
@@ -8325,6 +8354,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKeyi32, Op.getOperand(7)});
   }
+  case Intrinsic::amdgcn_addrspacecast_nonnull:
+    return lowerADDRSPACECAST(Op, DAG);
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 4 additions & 0 deletions
@@ -305,6 +305,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                           MachineFunction &MF,
                           unsigned IntrinsicID) const override;

+  void CollectTargetIntrinsicOperands(const CallInst &I,
+                                      SmallVectorImpl<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
   bool getAddrModeArguments(IntrinsicInst * /*I*/,
                             SmallVectorImpl<Value*> &/*Ops*/,
                             Type *&/*AccessTy*/) const override;
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn-- -amdgpu-codegenprepare -S < %s | FileCheck -check-prefix=OPT %s
+
+; Check that CGP doesn't try to create a amdgcn.addrspace.nonnull of vector, as that's not supported.
+
+define <4 x ptr> @vec_of_local_to_flat_nonnull_arg() {
+; OPT-LABEL: define <4 x ptr> @vec_of_local_to_flat_nonnull_arg() {
+; OPT-NEXT:    [[X:%.*]] = addrspacecast <4 x ptr addrspace(3)> zeroinitializer to <4 x ptr>
+; OPT-NEXT:    ret <4 x ptr> [[X]]
+;
+  %x = addrspacecast <4 x ptr addrspace(3)> zeroinitializer to <4 x ptr>
+  ret <4 x ptr> %x
+}
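The test above only covers the vector bail-out; the flat-to-local direction also qualifies when the flat source is provably non-null. A hypothetical scalar case for contrast (a sketch, not part of the diff shown here):

define ptr addrspace(3) @flat_to_local_nonnull_arg(ptr nonnull %p) {
; Expected after amdgpu-codegenprepare (sketch):
;   %x = call ptr addrspace(3) @llvm.amdgcn.addrspacecast.nonnull.p3.p0(ptr %p)
  %x = addrspacecast ptr %p to ptr addrspace(3)
  ret ptr addrspace(3) %x
}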
