Skip to content

Commit 478d3cb

Browse files
committed
AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering
Consider the new atomic metadata when choosing to expand as cmpxchg instead.
1 parent 2feb058 commit 478d3cb

16 files changed

+15433
-12264
lines changed

clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ __global__ void ffp2(double *p) {
8787
// UNSAFE-LABEL: @_Z4ffp2Pd
8888
// UNSAFE: global_atomic_add_f64
8989
// UNSAFE: global_atomic_cmpswap_x2
90-
// UNSAFE: global_atomic_cmpswap_x2
91-
// UNSAFE: global_atomic_cmpswap_x2
90+
// UNSAFE: global_atomic_max_f64
91+
// UNSAFE: global_atomic_min_f64
9292
// UNSAFE: global_atomic_max_f64
9393
// UNSAFE: global_atomic_min_f64
9494
__atomic_fetch_add(p, 1.0, memory_order_relaxed);
@@ -124,8 +124,8 @@ __global__ void ffp3(long double *p) {
124124
// SAFE: global_atomic_cmpswap_b64
125125
// UNSAFE-LABEL: @_Z4ffp3Pe
126126
// UNSAFE: global_atomic_cmpswap_x2
127-
// UNSAFE: global_atomic_cmpswap_x2
128-
// UNSAFE: global_atomic_cmpswap_x2
127+
// UNSAFE: global_atomic_max_f64
128+
// UNSAFE: global_atomic_min_f64
129129
// UNSAFE: global_atomic_max_f64
130130
// UNSAFE: global_atomic_min_f64
131131
__atomic_fetch_add(p, 1.0L, memory_order_relaxed);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -16118,6 +16118,34 @@ static bool isBFloat2(Type *Ty) {
1611816118
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
1611916119
}
1612016120

16121+
/// \returns true if it's valid to emit a native instruction for \p RMW, based
16122+
/// on the properties of the target memory.
16123+
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16124+
const AtomicRMWInst *RMW,
16125+
bool HasSystemScope) {
16126+
// The remote/fine-grained access logic is different from the integer
16127+
// atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16128+
// fine-grained access does not work, even for a device local allocation.
16129+
//
16130+
// With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16131+
// allocations work.
16132+
if (HasSystemScope) {
16133+
if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16134+
RMW->hasMetadata("amdgpu.no.remote.memory"))
16135+
return true;
16136+
} else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16137+
return true;
16138+
16139+
if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
16140+
return true;
16141+
16142+
// TODO: Auto-upgrade this attribute to the metadata in function body and stop
16143+
// checking it.
16144+
return RMW->getFunction()
16145+
->getFnAttribute("amdgpu-unsafe-fp-atomics")
16146+
.getValueAsBool();
16147+
}
16148+
1612116149
TargetLowering::AtomicExpansionKind
1612216150
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1612316151
unsigned AS = RMW->getPointerAddressSpace();
@@ -16268,37 +16296,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1626816296
Type *Ty = RMW->getType();
1626916297

1627016298
// LDS float and double fmin/fmax were always supported.
16271-
if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16272-
return AtomicExpansionKind::None;
16273-
16274-
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16275-
return AtomicExpansionKind::CmpXChg;
16276-
16277-
// Always expand system scope fp atomics.
16278-
if (HasSystemScope)
16279-
return AtomicExpansionKind::CmpXChg;
16299+
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16300+
return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16301+
: AtomicExpansionKind::CmpXChg;
16302+
}
1628016303

16281-
// For flat and global cases:
16282-
// float, double in gfx7. Manual claims denormal support.
16283-
// Removed in gfx8.
16284-
// float, double restored in gfx10.
16285-
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16286-
//
16287-
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16288-
// f32.
16289-
//
16290-
// FIXME: Check scope and fine grained memory
16291-
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16292-
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16293-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16294-
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16295-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16296-
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16297-
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16298-
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16299-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16300-
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16301-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16304+
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16305+
// For flat and global cases:
16306+
// float, double in gfx7. Manual claims denormal support.
16307+
// Removed in gfx8.
16308+
// float, double restored in gfx10.
16309+
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16310+
//
16311+
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16312+
// no f32.
16313+
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16314+
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16315+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16316+
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16317+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16318+
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16319+
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16320+
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16321+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16322+
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16323+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16324+
}
1630216325
}
1630316326

1630416327
return AtomicExpansionKind::CmpXChg;

0 commit comments

Comments
 (0)