AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat #95394
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes: Unlike the existing fadd cases, choose to ignore the requirement for amdgpu-unsafe-fp-atomics in case of fine-grained memory access. This is to minimize migration pain to the new atomic control metadata. This should not break any users, as the atomic intrinsics are still directly consumed, and clang does not yet produce vector FP atomicrmw.

Patch is 766.44 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95394.diff

12 Files Affected:
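For reference, these are the atomicrmw forms the patch makes legal, matching the cases exercised by the updated tests below (the subtarget feature names come from the code changes; the function and value names here are illustrative only):

define void @examples(ptr addrspace(1) %gptr, ptr %fptr,
                      <2 x half> %v, <2 x bfloat> %w) {
  ; global (addrspace 1) v2f16: legal with hasAtomicBufferGlobalPkAddF16Insts
  %r0 = atomicrmw fadd ptr addrspace(1) %gptr, <2 x half> %v syncscope("agent") seq_cst
  ; global v2bf16: legal with hasAtomicGlobalPkAddBF16Inst
  %r1 = atomicrmw fadd ptr addrspace(1) %gptr, <2 x bfloat> %w syncscope("agent") seq_cst
  ; flat (addrspace 0) v2f16 and v2bf16: legal with hasAtomicFlatPkAdd16Insts
  %r2 = atomicrmw fadd ptr %fptr, <2 x half> %v syncscope("agent") seq_cst
  %r3 = atomicrmw fadd ptr %fptr, <2 x bfloat> %w syncscope("agent") seq_cst
  ret void
}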
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0234cd9088ae7..19f19eb7b7f44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1659,6 +1659,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
});
}
+ if (ST.hasAtomicBufferGlobalPkAddF16Insts())
+ Atomic.legalFor({{V2F16, GlobalPtr}});
+ if (ST.hasAtomicGlobalPkAddBF16Inst())
+ Atomic.legalFor({{V2BF16, GlobalPtr}});
+ if (ST.hasAtomicFlatPkAdd16Insts())
+ Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 818cbde592432..99e3a8084f067 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1645,6 +1645,7 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
}
let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
@@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat
}
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
+// FIXME: These do not have signed offsets
defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
}
let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
-
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6325e75aa8f96..f59c7a73b18af 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15941,6 +15941,16 @@ static bool isHalf2OrBFloat2(Type *Ty) {
return false;
}
+static bool isHalf2(Type *Ty) {
+ FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+ return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
+}
+
+static bool isBFloat2(Type *Ty) {
+ FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+ return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
@@ -16009,10 +16019,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
AS != AMDGPUAS::BUFFER_FAT_POINTER)
return AtomicExpansionKind::CmpXChg;
- // TODO: gfx940 supports v2f16 and v2bf16
if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
return AtomicExpansionKind::None;
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ // gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else {
+ // gfx90a, gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // gfx940, gfx12
+ // FIXME: Need to skip buffer_fat_pointer?
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ }
+
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;
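On subtargets without these instructions, shouldExpandAtomicRMWInIR still returns AtomicExpansionKind::CmpXChg, and the AtomicExpand pass rewrites the atomicrmw into a compare-exchange loop; that loop is exactly what disappears from the GFX940/GFX12 assembly in the test diffs below. A hand-written sketch of that expansion for the v2f16 case (the pass bitcasts the FP values to i32, since cmpxchg takes integer operands):

define <2 x half> @cmpxchg_expansion(ptr %ptr, <2 x half> %val) {
entry:
  %0 = load <2 x half>, ptr %ptr, align 4
  br label %atomicrmw.start
atomicrmw.start:
  %loaded = phi <2 x half> [ %0, %entry ], [ %newloaded, %atomicrmw.start ]
  %new = fadd <2 x half> %loaded, %val    ; the v_pk_add_f16 in the old loop
  %expected = bitcast <2 x half> %loaded to i32
  %desired = bitcast <2 x half> %new to i32
  %pair = cmpxchg ptr %ptr, i32 %expected, i32 %desired syncscope("agent") seq_cst seq_cst
  %oldval = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  %newloaded = bitcast i32 %oldval to <2 x half>
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start
atomicrmw.end:
  ret <2 x half> %newloaded
}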
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index 4e21ef8379342..fff341b07881b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -237,24 +237,10 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
%result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -265,23 +251,10 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
%unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -292,24 +265,10 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
%result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -320,23 +279,10 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
%unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 1e7cf0e702a03..d644ff2f8efa4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -123,24 +123,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) {
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB0_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16:
@@ -200,26 +186,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
ret <2 x half> %result
@@ -335,23 +305,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) {
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB1_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16:
@@ -408,24 +365,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
ret void
@@ -595,43 +538,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB2_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
@@ -730,46 +640,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
ret <2 x bfloat> %result
@@ -895,42 +769,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) {
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NE...
[truncated]
@@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat
}

let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
// FIXME: These do not have signed offsets
Can you just use FlatAtomicPat?
Yes, but I was planning on copying the pre-existing bug and fixing them both together later (assuming this is actually a bug and there's not some special case I haven't found documentation for)
LGTM
This or your other PR that landed at the same time probably caused http://45.33.8.238/linux/140692/step_11.txt. Please take a look and revert for now if it takes a while to fix.
Since the AMDGPU target supports vectorization for the atomic_rmw fadd operation, enable building LLVM_AtomicRMWOp fadd with fixed vectors of 16-bit fp values as operands. See also: llvm#94845, llvm#95393, llvm#95394. Signed-off-by: Ilya Veselov <[email protected]>