Skip to content

Commit 50d27a4

Browse files
committed
AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering
Consider the new atomic metadata when choosing whether to expand as cmpxchg instead of emitting a native instruction.
1 parent db51986 commit 50d27a4

13 files changed

+7015
-5886
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -16093,6 +16093,34 @@ static bool isBFloat2(Type *Ty) {
1609316093
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
1609416094
}
1609516095

16096+
/// \returns true if it's valid to emit a native instruction for \p RMW, based
16097+
/// on the properties of the target memory.
16098+
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16099+
const AtomicRMWInst *RMW,
16100+
bool HasSystemScope) {
16101+
// The remote/fine-grained access logic is different from the integer
16102+
// atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16103+
// fine-grained access does not work, even for a device local allocation.
16104+
//
16105+
// With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16106+
// allocations work.
16107+
if (HasSystemScope) {
16108+
if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16109+
RMW->hasMetadata("amdgpu.no.remote.memory"))
16110+
return true;
16111+
} else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16112+
return true;
16113+
16114+
if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
16115+
return true;
16116+
16117+
// TODO: Auto-upgrade this attribute to the metadata in function body and stop
16118+
// checking it.
16119+
return RMW->getFunction()
16120+
->getFnAttribute("amdgpu-unsafe-fp-atomics")
16121+
.getValueAsBool();
16122+
}
16123+
1609616124
TargetLowering::AtomicExpansionKind
1609716125
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1609816126
unsigned AS = RMW->getPointerAddressSpace();
@@ -16236,37 +16264,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1623616264
Type *Ty = RMW->getType();
1623716265

1623816266
// LDS float and double fmin/fmax were always supported.
16239-
if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16240-
return AtomicExpansionKind::None;
16241-
16242-
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16243-
return AtomicExpansionKind::CmpXChg;
16244-
16245-
// Always expand system scope fp atomics.
16246-
if (HasSystemScope)
16247-
return AtomicExpansionKind::CmpXChg;
16267+
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16268+
return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16269+
: AtomicExpansionKind::CmpXChg;
16270+
}
1624816271

16249-
// For flat and global cases:
16250-
// float, double in gfx7. Manual claims denormal support.
16251-
// Removed in gfx8.
16252-
// float, double restored in gfx10.
16253-
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16254-
//
16255-
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16256-
// f32.
16257-
//
16258-
// FIXME: Check scope and fine grained memory
16259-
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16260-
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16261-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16262-
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16263-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16264-
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16265-
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16266-
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16267-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16268-
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16269-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16272+
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16273+
// For flat and global cases:
16274+
// float, double in gfx7. Manual claims denormal support.
16275+
// Removed in gfx8.
16276+
// float, double restored in gfx10.
16277+
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16278+
//
16279+
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16280+
// no f32.
16281+
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16282+
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16283+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16284+
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16285+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16286+
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16287+
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16288+
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16289+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16290+
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16291+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16292+
}
1627016293
}
1627116294

1627216295
return AtomicExpansionKind::CmpXChg;

0 commit comments

Comments
 (0)