AMDGPU: Handle new atomicrmw metadata for fadd case #96760

Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This is the most complex atomicrmw support case. Note we don't have accurate remarks for all of the cases, which I'm planning on fixing in a later change with more precise wording.

Continue respecting amdgpu-unsafe-fp-atomics until its eventual removal. Also seems to fix a few cases not interpreting amdgpu-unsafe-fp-atomics appropriately aggressively.

Patch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96760.diff

37 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 11ebfe7511f7b..f9b5aea144440 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16037,26 +16037,15 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
SNaN, Depth);
}
-#if 0
-// FIXME: This should be checked before unsafe fp atomics are enabled
-// Global FP atomic instructions have a hardcoded FP mode and do not support
-// FP32 denormals, and only support v2f16 denormals.
-static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
- const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
- auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
- if (&Flt == &APFloat::IEEEsingle())
- return DenormMode == DenormalMode::getPreserveSign();
- return DenormMode == DenormalMode::getIEEE();
-}
-#endif
+// On older subtargets, global FP atomic instructions have a hardcoded FP mode
+// and do not support FP32 denormals, and only support v2f16/f64 denormals.
+static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
+ if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
+ return true;
-// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
-// floating point atomic instructions. May generate more efficient code,
-// but may not respect rounding and denormal modes, and may give incorrect
-// results for certain memory destinations.
-bool unsafeFPAtomicsDisabled(Function *F) {
- return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
- "true";
+ const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
+ auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
+ return DenormMode == DenormalMode::getPreserveSign();
}
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16185,75 +16174,74 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
}
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
- AS != AMDGPUAS::BUFFER_FAT_POINTER)
- return AtomicExpansionKind::CmpXChg;
-
- if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
- return AtomicExpansionKind::None;
-
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
- // gfx940, gfx12
- // FIXME: Needs to account for no fine-grained memory
- if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
- return AtomicExpansionKind::None;
- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
- // gfx90a, gfx940, gfx12
- // FIXME: Needs to account for no fine-grained memory
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
- return AtomicExpansionKind::None;
-
- // gfx940, gfx12
- // FIXME: Needs to account for no fine-grained memory
- if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
- return AtomicExpansionKind::None;
- } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- // gfx90a, gfx940, gfx12
- // FIXME: Needs to account for no fine-grained memory
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
- return AtomicExpansionKind::None;
-
- // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
- // buffer. gfx12 does have the buffer version.
- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
- return AtomicExpansionKind::None;
- }
-
- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
- return AtomicExpansionKind::CmpXChg;
-
- // Always expand system scope fp atomics.
- if (HasSystemScope)
+ // LDS atomics respect the denormal mode from the mode register.
+ //
+ // Traditionally f32 global/buffer memory atomics would unconditionally
+ // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
+ // flush.
+ //
+ // On targets with flat atomic fadd, denormals would flush depending on
+ // whether the target address resides in LDS or global memory. We consider
+ // this flat-maybe-flush as will-flush.
+ if (Ty->isFloatTy() &&
+ !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
+ !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
return AtomicExpansionKind::CmpXChg;
- // global and flat atomic fadd f64: gfx90a, gfx940.
- if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
-
- if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- }
+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ // gfx940, gfx12
+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
+ // gfx90a, gfx940, gfx12
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // gfx940, gfx12
+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+ // gfx90a, gfx940, gfx12
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
+ // buffer. gfx12 does have the buffer version.
+ if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ }
- // flat atomic fadd f32: gfx940, gfx11+.
- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
- if (Subtarget->hasFlatAtomicFaddF32Inst())
+ // global and flat atomic fadd f64: gfx90a, gfx940.
+ if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
- // If it is in flat address space, and the type is float, we will try to
- // expand it, if the target supports global and lds atomic fadd. The
- // reason we need that is, in the expansion, we emit the check of address
- // space. If it is in global address space, we emit the global atomic
- // fadd; if it is in shared address space, we emit the LDS atomic fadd.
- if (Subtarget->hasLDSFPAtomicAddF32()) {
+ if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
- return AtomicExpansionKind::Expand;
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
- return AtomicExpansionKind::Expand;
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ }
+
+ // flat atomic fadd f32: gfx940, gfx11+.
+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
+ if (Subtarget->hasFlatAtomicFaddF32Inst())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+
+ // If it is in flat address space, and the type is float, we will try to
+ // expand it, if the target supports global and lds atomic fadd. The
+ // reason we need that is, in the expansion, we emit the check of
+ // address space. If it is in global address space, we emit the global
+ // atomic fadd; if it is in shared address space, we emit the LDS atomic
+ // fadd.
+ if (Subtarget->hasLDSFPAtomicAddF32()) {
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
+ return AtomicExpansionKind::Expand;
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
+ return AtomicExpansionKind::Expand;
+ }
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index aa9ebb9226cdd..7ce1c17b7ccfd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -79,7 +79,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX11-NEXT: S_ENDPGM 0
- %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
+ %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -107,10 +107,12 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data
; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
+ %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret float %ret
}
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr, float)
attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
index 68d8e3d747b86..869f073f3b1b9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
@@ -55,7 +55,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
+ %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -78,10 +78,12 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
- %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
+ %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret double %ret
}
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)
attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index 632dbd45279fb..710d48be037e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -34,7 +34,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
- %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
+ %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -50,7 +50,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
- %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
+ %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -75,7 +75,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
+ %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret float %ret
}
@@ -235,3 +235,5 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
}
attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 66b22bedaf072..429b045e31c12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1095,32 +1095,20 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB39_3
+; GFX90A-NEXT: s_cbranch_execz .LBB39_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
-; GFX90A-NEXT: .LBB39_3:
+; GFX90A-NEXT: .LBB39_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
@@ -1146,7 +1134,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: .LBB39_2:
; GFX940-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1196,7 +1184,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: .LBB40_2:
; GFX940-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1209,32 +1197,20 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB41_3
+; GFX90A-NEXT: s_cbranch_execz .LBB41_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
-; GFX90A-NEXT: .LBB41_3:
+; GFX90A-NEXT: .LBB41_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system:
@@ -1260,7 +1236,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: .LBB41_2:
; GFX940-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1310,7 +1286,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: .LBB42_2:
; GFX940-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1337,26 +1313,13 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat:
@@ -136...
[truncated]
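The updated tests above show the shape of the change: instead of a blanket function attribute, individual atomicrmw operations are tagged with metadata describing what the address can be. A minimal sketch of IR exercising the three metadata kinds involved here (the function and value names are illustrative, not taken from the patch):

```llvm
; Each !amdgpu.* attachment is an empty metadata node used as a flag.
; !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory assert the
; pointer does not refer to fine-grained or remote memory, and
; !amdgpu.ignore.denormal.mode permits selecting an instruction that may
; flush f32 denormal inputs/results.
define float @sketch_fadd(ptr addrspace(1) %ptr, float %val) {
  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
  ret float %ret
}

!0 = !{}
```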
ping
ping
Some nits but generally LGTM
bool unsafeFPAtomicsDisabled(Function *F) {
  return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
         "true";
// TODO: Remove this.
Can you elaborate (in a comment) on why/when this should be removed?
The patch to remove this is already waiting for review further up the stack
} else {
  // gfx908
  if (RMW->use_empty() &&
      Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
      isHalf2(Ty))
    return ReportUnsafeHWInst(AtomicExpansionKind::None);
}
else if?
It will just be undone later up the stack
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
  return AtomicExpansionKind::Expand;
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
nit: use || and only one if
I had that at one point but decided it's less readable as all the conditions get added
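For reference, the single-if form being suggested would look roughly like this (a sketch of the review suggestion, not code from the patch):

```cpp
// Equivalent to the two separate ifs above; the author keeps them split
// because additional per-case conditions are expected to be folded in later.
if ((RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) ||
    (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()))
  return AtomicExpansionKind::Expand;
```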
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
  return ReportUnsafeHWInst(AtomicExpansionKind::None);
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
nit: use || and only one if
This is the most complex atomicrmw support case. Note we don't have accurate remarks for all of the cases, which I'm planning on fixing in a later change with more precise wording.

Continue respecting amdgpu-unsafe-fp-atomics until its eventual removal. Also seems to fix a few cases not interpreting amdgpu-unsafe-fp-atomics appropriately aggressively.
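Since the attribute is still honored, a function-wide opt-in continues to look like the tests above (a sketch with an illustrative function name, not code from the patch):

```llvm
; Legacy, function-wide opt-in kept working by this patch until the attribute
; is eventually removed; the per-instruction metadata is the newer mechanism.
define void @sketch_legacy_unsafe(ptr addrspace(1) %ptr, float %val) #0 {
  %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
  ret void
}

attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
```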