Skip to content

Commit f00129c

Browse files
committed
AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering
Consider the new atomic metadata when choosing to expand as cmpxchg instead.
1 parent 2ca300f commit f00129c

16 files changed

+19189
-20456
lines changed

clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ __global__ void ffp2(double *p) {
5757
// SAFE: global_atomic_cmpswap_b64
5858
// UNSAFE-LABEL: @_Z4ffp2Pd
5959
// UNSAFE: global_atomic_cmpswap_x2
60-
// UNSAFE: global_atomic_cmpswap_x2
61-
// UNSAFE: global_atomic_cmpswap_x2
60+
// UNSAFE: global_atomic_max_f64
61+
// UNSAFE: global_atomic_min_f64
6262
// UNSAFE: global_atomic_max_f64
6363
// UNSAFE: global_atomic_min_f64
6464
__atomic_fetch_sub(p, 1.0, memory_order_relaxed);
@@ -84,8 +84,8 @@ __global__ void ffp3(long double *p) {
8484
// SAFE: global_atomic_cmpswap_b64
8585
// UNSAFE-LABEL: @_Z4ffp3Pe
8686
// UNSAFE: global_atomic_cmpswap_x2
87-
// UNSAFE: global_atomic_cmpswap_x2
88-
// UNSAFE: global_atomic_cmpswap_x2
87+
// UNSAFE: global_atomic_max_f64
88+
// UNSAFE: global_atomic_min_f64
8989
// UNSAFE: global_atomic_max_f64
9090
// UNSAFE: global_atomic_min_f64
9191
__atomic_fetch_sub(p, 1.0L, memory_order_relaxed);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -16114,6 +16114,34 @@ static bool isBFloat2(Type *Ty) {
1611416114
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
1611516115
}
1611616116

16117+
/// \returns true if it's valid to emit a native instruction for \p RMW, based
16118+
/// on the properties of the target memory.
16119+
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16120+
const AtomicRMWInst *RMW,
16121+
bool HasSystemScope) {
16122+
// The remote/fine-grained access logic is different from the integer
16123+
// atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16124+
// fine-grained access does not work, even for a device local allocation.
16125+
//
16126+
// With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16127+
// allocations work.
16128+
if (HasSystemScope) {
16129+
if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16130+
RMW->hasMetadata("amdgpu.no.remote.memory"))
16131+
return true;
16132+
} else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16133+
return true;
16134+
16135+
if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
16136+
return true;
16137+
16138+
// TODO: Auto-upgrade this attribute to the metadata in function body and stop
16139+
// checking it.
16140+
return RMW->getFunction()
16141+
->getFnAttribute("amdgpu-unsafe-fp-atomics")
16142+
.getValueAsBool();
16143+
}
16144+
1611716145
TargetLowering::AtomicExpansionKind
1611816146
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1611916147
unsigned AS = RMW->getPointerAddressSpace();
@@ -16264,37 +16292,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1626416292
Type *Ty = RMW->getType();
1626516293

1626616294
// LDS float and double fmin/fmax were always supported.
16267-
if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16268-
return AtomicExpansionKind::None;
16269-
16270-
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16271-
return AtomicExpansionKind::CmpXChg;
16272-
16273-
// Always expand system scope fp atomics.
16274-
if (HasSystemScope)
16275-
return AtomicExpansionKind::CmpXChg;
16295+
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16296+
return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16297+
: AtomicExpansionKind::CmpXChg;
16298+
}
1627616299

16277-
// For flat and global cases:
16278-
// float, double in gfx7. Manual claims denormal support.
16279-
// Removed in gfx8.
16280-
// float, double restored in gfx10.
16281-
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16282-
//
16283-
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16284-
// f32.
16285-
//
16286-
// FIXME: Check scope and fine grained memory
16287-
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16288-
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16289-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16290-
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16291-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16292-
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16293-
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16294-
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16295-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16296-
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16297-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16300+
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16301+
// For flat and global cases:
16302+
// float, double in gfx7. Manual claims denormal support.
16303+
// Removed in gfx8.
16304+
// float, double restored in gfx10.
16305+
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16306+
//
16307+
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16308+
// no f32.
16309+
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16310+
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16311+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16312+
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16313+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16314+
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16315+
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16316+
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16317+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16318+
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16319+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16320+
}
1629816321
}
1629916322

1630016323
return AtomicExpansionKind::CmpXChg;

0 commit comments

Comments
 (0)