Commit 5021e6d

AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat (#95394)
Unlike the existing fadd cases, choose to ignore the requirement for amdgpu-unsafe-fp-atomics in the case of fine-grained memory access, to minimize migration pain to the new atomic control metadata. This should not break any users: the atomic intrinsics are still consumed directly, and clang does not yet produce vector FP atomicrmw.
1 parent 0a9a5f9 commit 5021e6d
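
For context, the new handling targets IR like the following; a minimal, hypothetical sketch (the function name is invented here) of the atomicrmw form that previously had to go through the amdgcn fadd intrinsics or a compare-and-swap expansion loop:

; Hypothetical example of a packed-f16 atomic fadd written directly as atomicrmw.
define <2 x half> @example_global_fadd_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) {
  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
  ret <2 x half> %res
}

Since clang does not emit this form yet, the immediate consumers are frontends and passes that build the atomicrmw instruction directly.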

10 files changed (+6663, -1347 lines)

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 7 additions & 0 deletions
@@ -1659,6 +1659,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       });
   }
 
+  if (ST.hasAtomicBufferGlobalPkAddF16Insts())
+    Atomic.legalFor({{V2F16, GlobalPtr}});
+  if (ST.hasAtomicGlobalPkAddBF16Inst())
+    Atomic.legalFor({{V2BF16, GlobalPtr}});
+  if (ST.hasAtomicFlatPkAdd16Insts())
+    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
+
   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
   // demarshalling
   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
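
Roughly, the rules above mark G_ATOMICRMW_FADD as legal for the packed 16-bit types on the pointer spaces the subtarget supports, so GlobalISel keeps the operation intact instead of expanding it. A minimal sketch, assuming a subtarget where ST.hasAtomicGlobalPkAddBF16Inst() holds (the function name below is hypothetical):

; Hypothetical example: a global-pointer v2bf16 atomicrmw fadd that the new
; legality rules keep as a single G_ATOMICRMW_FADD under -global-isel.
define void @example_global_fadd_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
  %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
  ret void
}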

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 5 additions & 1 deletion
@@ -1645,6 +1645,7 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
 let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
 defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
 defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
 }
 
 let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
@@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat
 }
 
 let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
+// FIXME: These do not have signed offsets
 defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
 }
 
 let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
-
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
 } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
 
 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
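
The added GlobalFLATAtomicPats/FlatSignedAtomicPat entries map the generic atomic_load_fadd_global and atomic_load_fadd_flat selection nodes onto the packed-add machine instructions, alongside the existing intrinsic-based patterns. A hypothetical flat-pointer bf16 case of the kind these patterns are meant to cover (illustrative only; the function name is invented):

; Hypothetical example: a flat-pointer v2bf16 atomicrmw fadd, the case the new
; FLAT_ATOMIC_PK_ADD_BF16 pattern for atomic_load_fadd_flat is aimed at.
define <2 x bfloat> @example_flat_fadd_v2bf16(ptr %ptr, <2 x bfloat> %val) {
  %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
  ret <2 x bfloat> %res
}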

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 28 additions & 1 deletion
@@ -15942,6 +15942,16 @@ static bool isHalf2OrBFloat2(Type *Ty) {
   return false;
 }
 
+static bool isHalf2(Type *Ty) {
+  FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+  return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
+}
+
+static bool isBFloat2(Type *Ty) {
+  FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+  return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
@@ -16010,10 +16020,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
         AS != AMDGPUAS::BUFFER_FAT_POINTER)
       return AtomicExpansionKind::CmpXChg;
 
-    // TODO: gfx940 supports v2f16 and v2bf16
     if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
       return AtomicExpansionKind::None;
 
+    if (AS == AMDGPUAS::FLAT_ADDRESS) {
+      // gfx940, gfx12
+      // FIXME: Needs to account for no fine-grained memory
+      if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+        return AtomicExpansionKind::None;
+    } else {
+      // gfx90a, gfx940, gfx12
+      // FIXME: Needs to account for no fine-grained memory
+      if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+        return AtomicExpansionKind::None;
+
+      // gfx940, gfx12
+      // FIXME: Need to skip buffer_fat_pointer?
+      // FIXME: Needs to account for no fine-grained memory
+      if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
+        return AtomicExpansionKind::None;
+    }
+
     if (unsafeFPAtomicsDisabled(RMW->getFunction()))
       return AtomicExpansionKind::CmpXChg;
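
The net effect in shouldExpandAtomicRMWInIR is that these v2f16/v2bf16 cases now return AtomicExpansionKind::None and select the native packed-add instructions, modulo the fine-grained-memory FIXMEs, while subtargets lacking the relevant feature keep the CmpXChg expansion. A hypothetical sketch of a case that still expands (the gfx-family lists come from the comments above; the function name is invented):

; Hypothetical example: v2f16 through a flat pointer needs
; hasAtomicFlatPkAdd16Insts(); on a subtarget without it, this atomicrmw is
; still rewritten into a cmpxchg loop by AtomicExpand.
define <2 x half> @example_flat_fadd_v2f16(ptr %ptr, <2 x half> %val) {
  %res = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
  ret <2 x half> %res
}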
llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 4 additions & 58 deletions
@@ -237,24 +237,10 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
 ; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
   %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -265,23 +251,10 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
 ; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
   %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -292,24 +265,10 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
 ; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0
 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr %ptr, i32 256
   %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -320,23 +279,10 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
 ; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr %ptr, i32 256
   %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
