Skip to content

Commit 4cf1a19

Browse files
committed
Reapply "AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat (#95394)"
This reverts commit 95b77d9.
1 parent a1bdb01 commit 4cf1a19

12 files changed

+6753
-3105
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1659,6 +1659,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
16591659
});
16601660
}
16611661

1662+
if (ST.hasAtomicBufferGlobalPkAddF16Insts())
1663+
Atomic.legalFor({{V2F16, GlobalPtr}});
1664+
if (ST.hasAtomicGlobalPkAddBF16Inst())
1665+
Atomic.legalFor({{V2BF16, GlobalPtr}});
1666+
if (ST.hasAtomicFlatPkAdd16Insts())
1667+
Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1668+
16621669
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
16631670
// demarshalling
16641671
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1645,6 +1645,7 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
16451645
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
16461646
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
16471647
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
1648+
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
16481649
}
16491650

16501651
let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
@@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat
16691670
}
16701671

16711672
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
1673+
// FIXME: These do not have signed offsets
16721674
defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
16731675
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
1676+
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
1677+
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
16741678
}
16751679

16761680
let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
16771681
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
1678-
1682+
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
16791683
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
16801684

16811685
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15944,6 +15944,16 @@ static bool isHalf2OrBFloat2(Type *Ty) {
1594415944
return false;
1594515945
}
1594615946

15947+
static bool isHalf2(Type *Ty) {
15948+
FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
15949+
return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
15950+
}
15951+
15952+
static bool isBFloat2(Type *Ty) {
15953+
FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
15954+
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
15955+
}
15956+
1594715957
TargetLowering::AtomicExpansionKind
1594815958
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1594915959
unsigned AS = RMW->getPointerAddressSpace();
@@ -16012,10 +16022,29 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1601216022
AS != AMDGPUAS::BUFFER_FAT_POINTER)
1601316023
return AtomicExpansionKind::CmpXChg;
1601416024

16015-
// TODO: gfx940 supports v2f16 and v2bf16
1601616025
if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
1601716026
return AtomicExpansionKind::None;
1601816027

16028+
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16029+
// gfx940, gfx12
16030+
// FIXME: Needs to account for no fine-grained memory
16031+
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16032+
return AtomicExpansionKind::None;
16033+
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16034+
// gfx90a, gfx940, gfx12
16035+
// FIXME: Needs to account for no fine-grained memory
16036+
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16037+
return AtomicExpansionKind::None;
16038+
16039+
// gfx940, gfx12
16040+
// FIXME: Needs to account for no fine-grained memory
16041+
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16042+
return AtomicExpansionKind::None;
16043+
}
16044+
16045+
// TODO: Handle buffer case. gfx90a and gfx940 support <2 x half>. gfx12
16046+
// supports <2 x half> and <2 x bfloat>.
16047+
1601916048
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
1602016049
return AtomicExpansionKind::CmpXChg;
1602116050

llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 4 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -237,24 +237,10 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
237237
; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
238238
; GFX940: ; %bb.0:
239239
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240-
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024
241-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
242-
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
243-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
244-
; GFX940-NEXT: s_waitcnt vmcnt(0)
245-
; GFX940-NEXT: v_mov_b32_e32 v5, v3
246-
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
247240
; GFX940-NEXT: buffer_wbl2 sc1
248-
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
241+
; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0
249242
; GFX940-NEXT: s_waitcnt vmcnt(0)
250243
; GFX940-NEXT: buffer_inv sc1
251-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
252-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
253-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
254-
; GFX940-NEXT: s_cbranch_execnz .LBB17_1
255-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
256-
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
257-
; GFX940-NEXT: v_mov_b32_e32 v0, v3
258244
; GFX940-NEXT: s_setpc_b64 s[30:31]
259245
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
260246
%result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -265,23 +251,10 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
265251
; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
266252
; GFX940: ; %bb.0:
267253
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268-
; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024
269-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
270-
; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
271-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
272-
; GFX940-NEXT: s_waitcnt vmcnt(0)
273-
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
274254
; GFX940-NEXT: buffer_wbl2 sc1
275-
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
255+
; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024
276256
; GFX940-NEXT: s_waitcnt vmcnt(0)
277257
; GFX940-NEXT: buffer_inv sc1
278-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
279-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
280-
; GFX940-NEXT: v_mov_b32_e32 v5, v3
281-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
282-
; GFX940-NEXT: s_cbranch_execnz .LBB18_1
283-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
284-
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
285258
; GFX940-NEXT: s_setpc_b64 s[30:31]
286259
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
287260
%unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -292,24 +265,10 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
292265
; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
293266
; GFX940: ; %bb.0:
294267
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295-
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024
296-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
297-
; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
298-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
299-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
300-
; GFX940-NEXT: v_mov_b32_e32 v5, v3
301-
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
302268
; GFX940-NEXT: buffer_wbl2 sc1
303-
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
269+
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0
304270
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
305271
; GFX940-NEXT: buffer_inv sc1
306-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
307-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
308-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
309-
; GFX940-NEXT: s_cbranch_execnz .LBB19_1
310-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
311-
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
312-
; GFX940-NEXT: v_mov_b32_e32 v0, v3
313272
; GFX940-NEXT: s_setpc_b64 s[30:31]
314273
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
315274
%result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -320,23 +279,10 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
320279
; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
321280
; GFX940: ; %bb.0:
322281
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323-
; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024
324-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
325-
; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
326-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
327-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
328-
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
329282
; GFX940-NEXT: buffer_wbl2 sc1
330-
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
283+
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
331284
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
332285
; GFX940-NEXT: buffer_inv sc1
333-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
334-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
335-
; GFX940-NEXT: v_mov_b32_e32 v5, v3
336-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
337-
; GFX940-NEXT: s_cbranch_execnz .LBB20_1
338-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
339-
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
340286
; GFX940-NEXT: s_setpc_b64 s[30:31]
341287
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
342288
%unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst

0 commit comments

Comments
 (0)