Commit 8eb6914

AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat
Unlike the existing fadd cases, this ignores the amdgpu-unsafe-fp-atomics requirement in the case of fine-grained memory access, in order to minimize migration pain to the new atomic control metadata. This should not break any users: the atomic intrinsics are still consumed directly, and clang does not yet produce vector FP atomicrmw.
1 parent 86c3c5a

12 files changed: +6787 −3505 lines
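
For orientation, the change is about plain vector FP atomicrmw operations, which previously had to be expanded into a compare-and-swap loop. A minimal example of the affected IR, modeled on the updated tests below (the function name is illustrative):

define <2 x half> @example_global_pk_fadd(ptr addrspace(1) %ptr, <2 x half> %val) {
  ; With this change, subtargets that have the packed f16 global atomics lower
  ; this to a single global_atomic_pk_add_f16 rather than an atomicrmw.start
  ; cmpxchg loop (see the test diff at the end of this commit).
  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
  ret <2 x half> %result
}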

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 7 additions & 0 deletions
@@ -1659,6 +1659,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
         });
   }
 
+  if (ST.hasAtomicBufferGlobalPkAddF16Insts())
+    Atomic.legalFor({{V2F16, GlobalPtr}});
+  if (ST.hasAtomicGlobalPkAddBF16Inst())
+    Atomic.legalFor({{V2BF16, GlobalPtr}});
+  if (ST.hasAtomicFlatPkAdd16Insts())
+    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
+
   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
   // demarshalling
   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
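
These legality rules keep G_ATOMICRMW_FADD legal for the packed 16-bit types on the listed address spaces, so the GlobalISel pipeline can carry the operation through to instruction selection instead of having it broken down. A small sketch of the flat-pointer v2bf16 case (run line and function name are assumptions; the committed tests exercise the v2f16 variants):

; llc -global-isel -mtriple=amdgcn -mcpu=gfx940 < example.ll
define void @example_flat_pk_fadd_bf16(ptr %ptr, <2 x bfloat> %val) {
  ; Covered by the ST.hasAtomicFlatPkAdd16Insts() clause above, which marks
  ; {V2BF16, FlatPtr} legal.
  %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
  ret void
}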

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 5 additions & 1 deletion
@@ -1645,6 +1645,7 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
 let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
 defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
 defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
 }
 
 let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
@@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat
 }
 
 let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
+// FIXME: These do not have signed offsets
 defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
 }
 
 let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
-
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
 } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
 
 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
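
The new entries select the PK_ADD instructions from the generic atomic_load_fadd selectors, not only from the target intrinsics, so ordinary atomicrmw IR can reach them. A hedged illustration of the global v2bf16 pattern (function name is made up):

define void @example_global_pk_fadd_bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
  ; Expected to match the new GLOBAL_ATOMIC_PK_ADD_BF16 "atomic_load_fadd_global"
  ; pattern on targets with HasAtomicGlobalPkAddBF16Inst.
  %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
  ret void
}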

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 28 additions & 1 deletion
@@ -15941,6 +15941,16 @@ static bool isHalf2OrBFloat2(Type *Ty) {
   return false;
 }
 
+static bool isHalf2(Type *Ty) {
+  FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+  return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
+}
+
+static bool isBFloat2(Type *Ty) {
+  FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+  return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
@@ -16009,10 +16019,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
       AS != AMDGPUAS::BUFFER_FAT_POINTER)
     return AtomicExpansionKind::CmpXChg;
 
-  // TODO: gfx940 supports v2f16 and v2bf16
   if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
     return AtomicExpansionKind::None;
 
+  if (AS == AMDGPUAS::FLAT_ADDRESS) {
+    // gfx940, gfx12
+    // FIXME: Needs to account for no fine-grained memory
+    if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+      return AtomicExpansionKind::None;
+  } else {
+    // gfx90a, gfx940, gfx12
+    // FIXME: Needs to account for no fine-grained memory
+    if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+      return AtomicExpansionKind::None;
+
+    // gfx940, gfx12
+    // FIXME: Need to skip buffer_fat_pointer?
+    // FIXME: Needs to account for no fine-grained memory
+    if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
+      return AtomicExpansionKind::None;
+  }
+
   if (unsafeFPAtomicsDisabled(RMW->getFunction()))
     return AtomicExpansionKind::CmpXChg;
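
The net effect is that the packed f16/bf16 cases return AtomicExpansionKind::None before the unsafeFPAtomicsDisabled check is reached, which is what the commit message means by ignoring the amdgpu-unsafe-fp-atomics requirement. A sketch under that reading (function name is illustrative):

define void @example_v2f16_without_unsafe_attr(ptr %ptr, <2 x half> %val) {
  ; No "amdgpu-unsafe-fp-atomics"="true" attribute is present, yet on the
  ; subtargets handled above this is still not expanded to a cmpxchg loop.
  %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
  ret void
}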

llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 4 additions & 58 deletions
@@ -237,24 +237,10 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
 ; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:1024
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v5, v3
-; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT:    global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
   %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -265,23 +251,10 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
 ; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:1024
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off offset:1024
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v5, v3
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
   %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -292,24 +265,10 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
 ; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:1024
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:  .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v5, v3
-; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB19_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr %ptr, i32 256
   %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -320,23 +279,10 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
 ; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:1024
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:  .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v5, v3
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB20_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr %ptr, i32 256
   %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
%unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
