Skip to content

Commit dfda9c5

Browse files
authored
AMDGPU: Handle new atomicrmw metadata for fadd case (#96760)
This is the most complex atomicrmw support case. Note we don't have accurate remarks for all of the cases, which I'm planning on fixing in a later change with more precise wording. Continue respecting amdgpu-unsafe-fp-atomics until its eventual removal. Also seems to fix a few cases not interpreting amdgpu-unsafe-fp-atomics appropriately aggressively.
1 parent d02757c commit dfda9c5

39 files changed

+28103
-21755
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 82 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -16062,26 +16062,21 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
1606216062
SNaN, Depth);
1606316063
}
1606416064

16065-
#if 0
16066-
// FIXME: This should be checked before unsafe fp atomics are enabled
16067-
// Global FP atomic instructions have a hardcoded FP mode and do not support
16068-
// FP32 denormals, and only support v2f16 denormals.
16069-
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16065+
// On older subtargets, global FP atomic instructions have a hardcoded FP mode
16066+
// and do not support FP32 denormals, and only support v2f16/f64 denormals.
16067+
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16068+
if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16069+
return true;
16070+
1607016071
const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16071-
auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16072-
if (&Flt == &APFloat::IEEEsingle())
16073-
return DenormMode == DenormalMode::getPreserveSign();
16074-
return DenormMode == DenormalMode::getIEEE();
16075-
}
16076-
#endif
16072+
auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16073+
if (DenormMode == DenormalMode::getPreserveSign())
16074+
return true;
1607716075

16078-
// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16079-
// floating point atomic instructions. May generate more efficient code,
16080-
// but may not respect rounding and denormal modes, and may give incorrect
16081-
// results for certain memory destinations.
16082-
bool unsafeFPAtomicsDisabled(Function *F) {
16083-
return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16084-
"true";
16076+
// TODO: Remove this.
16077+
return RMW->getFunction()
16078+
->getFnAttribute("amdgpu-unsafe-fp-atomics")
16079+
.getValueAsBool();
1608516080
}
1608616081

1608716082
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16210,82 +16205,85 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1621016205
return AtomicExpansionKind::CmpXChg;
1621116206
}
1621216207

16213-
if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16214-
AS != AMDGPUAS::BUFFER_FAT_POINTER)
16215-
return AtomicExpansionKind::CmpXChg;
16216-
16217-
if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16218-
return AtomicExpansionKind::None;
16219-
16220-
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16221-
// gfx940, gfx12
16222-
// FIXME: Needs to account for no fine-grained memory
16223-
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16224-
return AtomicExpansionKind::None;
16225-
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16226-
// gfx90a, gfx940, gfx12
16227-
// FIXME: Needs to account for no fine-grained memory
16228-
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16229-
return AtomicExpansionKind::None;
16230-
16231-
// gfx940, gfx12
16232-
// FIXME: Needs to account for no fine-grained memory
16233-
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16234-
return AtomicExpansionKind::None;
16235-
} else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16236-
// gfx90a, gfx940, gfx12
16237-
// FIXME: Needs to account for no fine-grained memory
16238-
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16239-
return AtomicExpansionKind::None;
16240-
16241-
// While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16242-
// buffer. gfx12 does have the buffer version.
16243-
if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16244-
return AtomicExpansionKind::None;
16245-
}
16246-
16247-
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16248-
return AtomicExpansionKind::CmpXChg;
16249-
16250-
// Always expand system scope fp atomics.
16251-
if (HasSystemScope)
16208+
// LDS atomics respect the denormal mode from the mode register.
16209+
//
16210+
// Traditionally f32 global/buffer memory atomics would unconditionally
16211+
// flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16212+
// flush.
16213+
//
16214+
// On targets with flat atomic fadd, denormals would flush depending on
16215+
// whether the target address resides in LDS or global memory. We consider
16216+
// this flat-maybe-flush as will-flush.
16217+
if (Ty->isFloatTy() &&
16218+
!Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16219+
!atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
1625216220
return AtomicExpansionKind::CmpXChg;
1625316221

16254-
// global and flat atomic fadd f64: gfx90a, gfx940.
16255-
if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16256-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16222+
// FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16223+
// safe. The message phrasing also should be better.
16224+
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16225+
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16226+
// gfx940, gfx12
16227+
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16228+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16229+
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16230+
// gfx90a, gfx940, gfx12
16231+
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16232+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
1625716233

16258-
if (AS != AMDGPUAS::FLAT_ADDRESS) {
16259-
if (Ty->isFloatTy()) {
16260-
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16261-
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16234+
// gfx940, gfx12
16235+
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
1626216236
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16263-
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16264-
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16237+
} else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16238+
// gfx90a, gfx940, gfx12
16239+
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
1626516240
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16266-
} else {
16267-
// gfx908
16268-
if (RMW->use_empty() &&
16269-
Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16241+
16242+
// While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16243+
// buffer. gfx12 does have the buffer version.
16244+
if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
1627016245
return ReportUnsafeHWInst(AtomicExpansionKind::None);
1627116246
}
16272-
}
1627316247

16274-
// flat atomic fadd f32: gfx940, gfx11+.
16275-
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16276-
if (Subtarget->hasFlatAtomicFaddF32Inst())
16248+
// global and flat atomic fadd f64: gfx90a, gfx940.
16249+
if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
1627716250
return ReportUnsafeHWInst(AtomicExpansionKind::None);
1627816251

16279-
// If it is in flat address space, and the type is float, we will try to
16280-
// expand it, if the target supports global and lds atomic fadd. The
16281-
// reason we need that is, in the expansion, we emit the check of address
16282-
// space. If it is in global address space, we emit the global atomic
16283-
// fadd; if it is in shared address space, we emit the LDS atomic fadd.
16284-
if (Subtarget->hasLDSFPAtomicAddF32()) {
16285-
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16286-
return AtomicExpansionKind::Expand;
16287-
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16288-
return AtomicExpansionKind::Expand;
16252+
if (AS != AMDGPUAS::FLAT_ADDRESS) {
16253+
if (Ty->isFloatTy()) {
16254+
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16255+
// gfx11+.
16256+
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16257+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16258+
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16259+
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16260+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16261+
} else {
16262+
// gfx908
16263+
if (RMW->use_empty() &&
16264+
Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16265+
isHalf2(Ty))
16266+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16267+
}
16268+
}
16269+
16270+
// flat atomic fadd f32: gfx940, gfx11+.
16271+
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16272+
if (Subtarget->hasFlatAtomicFaddF32Inst())
16273+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16274+
16275+
// If it is in flat address space, and the type is float, we will try to
16276+
// expand it, if the target supports global and lds atomic fadd. The
16277+
// reason we need that is, in the expansion, we emit the check of
16278+
// address space. If it is in global address space, we emit the global
16279+
// atomic fadd; if it is in shared address space, we emit the LDS atomic
16280+
// fadd.
16281+
if (Subtarget->hasLDSFPAtomicAddF32()) {
16282+
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16283+
return AtomicExpansionKind::Expand;
16284+
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16285+
return AtomicExpansionKind::Expand;
16286+
}
1628916287
}
1629016288
}
1629116289

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
5757
ret float %ret
5858
}
5959

60-
define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %data) #0 {
60+
define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %data) {
6161
; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw
6262
; GFX940: bb.1 (%ir-block.0):
6363
; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -79,11 +79,11 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da
7979
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
8080
; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
8181
; GFX11-NEXT: S_ENDPGM 0
82-
%ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
82+
%ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
8383
ret void
8484
}
8585

86-
define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data) #0 {
86+
define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data) {
8787
; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw
8888
; GFX940: bb.1 (%ir-block.0):
8989
; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -107,10 +107,10 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data
107107
; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
108108
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
109109
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
110-
%ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
110+
%ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
111111
ret float %ret
112112
}
113113

114114
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr, float)
115115

116-
attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
116+
!0 = !{}

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %da
4242
ret double %ret
4343
}
4444

45-
define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) #0 {
45+
define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) {
4646
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw
4747
; GFX90A_GFX940: bb.1 (%ir-block.0):
4848
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -55,11 +55,11 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
5555
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
5656
; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
5757
; GFX90A_GFX940-NEXT: S_ENDPGM 0
58-
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
58+
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
5959
ret void
6060
}
6161

62-
define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 {
62+
define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) {
6363
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw
6464
; GFX90A_GFX940: bb.1 (%ir-block.0):
6565
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -78,10 +78,10 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
7878
; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
7979
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
8080
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
81-
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
81+
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
8282
ret double %ret
8383
}
8484

8585
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)
8686

87-
attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
87+
!0 = !{}

llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
3636
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3737
; GFX940-NEXT: buffer_inv sc0 sc1
3838
; GFX940-NEXT: s_endpgm
39-
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
39+
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
4040
ret void
4141
}
4242

@@ -52,7 +52,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
5252
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5353
; GFX940-NEXT: buffer_inv sc0 sc1
5454
; GFX940-NEXT: s_endpgm
55-
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
55+
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
5656
ret void
5757
}
5858

@@ -77,7 +77,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
7777
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7878
; GFX940-NEXT: buffer_inv sc0 sc1
7979
; GFX940-NEXT: s_setpc_b64 s[30:31]
80-
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
80+
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
8181
ret float %ret
8282
}
8383

@@ -287,3 +287,5 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
287287
}
288288

289289
attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" }
290+
291+
!0 = !{}

0 commit comments

Comments
 (0)