Skip to content

Commit cf025c7

Browse files
authored
[AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic (#76149)
1 parent 5842dfe commit cf025c7

10 files changed

+124
-5
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
def global_ptr_ty : LLVMQualPointerType<1>;
14+
1315
class AMDGPUReadPreloadRegisterIntrinsic
1416
: DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
1517

@@ -2353,10 +2355,10 @@ def int_amdgcn_s_get_waveid_in_workgroup :
23532355
Intrinsic<[llvm_i32_ty], [],
23542356
[IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
23552357

2356-
class AMDGPUAtomicRtn<LLVMType vt> : Intrinsic <
2358+
class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic <
23572359
[vt],
2358-
[llvm_anyptr_ty, // vaddr
2359-
vt], // vdata(VGPR)
2360+
[pt, // vaddr
2361+
vt], // vdata(VGPR)
23602362
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
23612363
[SDNPMemOperand]>;
23622364

@@ -2486,6 +2488,8 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
24862488
[IntrNoMem, IntrConvergent, IntrWillReturn,
24872489
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
24882490

2491+
def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
2492+
24892493
def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
24902494
def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
24912495
def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,7 @@ defm int_amdgcn_global_atomic_fmax : noret_op;
642642
defm int_amdgcn_global_atomic_csub : noret_op;
643643
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
644644
defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
645+
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
645646
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
646647
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
647648
defm int_amdgcn_global_atomic_fmin_num : noret_op;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4690,6 +4690,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
46904690
case Intrinsic::amdgcn_flat_atomic_fmax_num:
46914691
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
46924692
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4693+
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
46934694
return getDefaultMappingAllVGPR(MI);
46944695
case Intrinsic::amdgcn_ds_ordered_add:
46954696
case Intrinsic::amdgcn_ds_ordered_swap:

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
243243
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
244244
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
245245
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
246+
def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
246247
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
247248
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
248249
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -926,9 +926,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho
926926
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
927927
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
928928

929-
} // End is_flat_global = 1
930-
929+
let SubtargetPredicate = isGFX12Plus in {
930+
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
931+
} // End SubtargetPredicate = isGFX12Plus
931932

933+
} // End is_flat_global = 1
932934

933935
let SubtargetPredicate = HasFlatScratchInsts in {
934936
defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>;
@@ -1529,6 +1531,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64>
15291531
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>;
15301532
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
15311533

1534+
let OtherPredicates = [isGFX12Plus] in {
1535+
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
1536+
}
1537+
15321538
let OtherPredicates = [isGFX10Plus] in {
15331539
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
15341540
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -2654,6 +2660,7 @@ defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_A
26542660
defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
26552661
defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
26562662
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
2663+
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;
26572664

26582665
// ENC_VSCRATCH.
26592666
defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,6 +1240,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
12401240
case Intrinsic::amdgcn_global_atomic_fmax:
12411241
case Intrinsic::amdgcn_global_atomic_fmin_num:
12421242
case Intrinsic::amdgcn_global_atomic_fmax_num:
1243+
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
12431244
case Intrinsic::amdgcn_flat_atomic_fadd:
12441245
case Intrinsic::amdgcn_flat_atomic_fmin:
12451246
case Intrinsic::amdgcn_flat_atomic_fmax:

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
3+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
4+
5+
declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64)
6+
7+
define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) {
8+
; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn:
9+
; GFX12-SDAG: ; %bb.0: ; %entry
10+
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
11+
; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
12+
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
13+
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
14+
; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
15+
; GFX12-SDAG-NEXT: s_endpgm
16+
;
17+
; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn:
18+
; GFX12-GISEL: ; %bb.0: ; %entry
19+
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
20+
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
21+
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
22+
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
23+
; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
24+
; GFX12-GISEL-NEXT: s_endpgm
25+
entry:
26+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 -4
27+
%unused = call i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1) %gep, i64 %in)
28+
ret void
29+
}
30+
31+
define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) {
32+
; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn:
33+
; GFX12-SDAG: ; %bb.0: ; %entry
34+
; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
35+
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
36+
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
37+
; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
38+
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
39+
; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
40+
; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
41+
; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
42+
; GFX12-SDAG-NEXT: s_nop 0
43+
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
44+
; GFX12-SDAG-NEXT: s_endpgm
45+
;
46+
; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn:
47+
; GFX12-GISEL: ; %bb.0: ; %entry
48+
; GFX12-GISEL-NEXT: s_clause 0x1
49+
; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
50+
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
51+
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
52+
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
53+
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
54+
; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
55+
; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
56+
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
57+
; GFX12-GISEL-NEXT: s_nop 0
58+
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
59+
; GFX12-GISEL-NEXT: s_endpgm
60+
entry:
61+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
62+
%val = call i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1) %gep, i64 %in)
63+
store i64 %val, ptr addrspace(1) %use
64+
ret void
65+
}

llvm/test/MC/AMDGPU/gfx11_unsupported.s

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2013,3 +2013,6 @@ ds_sub_clamp_rtn_u32 v5, v1, v2
20132013

20142014
ds_sub_clamp_u32 v1, v2
20152015
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
2016+
2017+
global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64
2018+
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

llvm/test/MC/AMDGPU/gfx12_asm_vflat.s

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1266,6 +1266,30 @@ global_atomic_or_b64 v[1:2], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN
12661266
global_atomic_or_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN
12671267
// GFX12: encoding: [0x7c,0x80,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
12681268

1269+
global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:-64
1270+
// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
1271+
1272+
global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64
1273+
// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
1274+
1275+
global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:-64
1276+
// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
1277+
1278+
global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:64
1279+
// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
1280+
1281+
global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN
1282+
// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
1283+
1284+
global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN
1285+
// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
1286+
1287+
global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN
1288+
// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
1289+
1290+
global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN
1291+
// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
1292+
12691293
global_atomic_sub_u32 v0, v2, s[0:1] offset:-64
12701294
// GFX12: encoding: [0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
12711295

llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -837,6 +837,18 @@
837837
# GFX12: global_atomic_xor_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
838838
0x7c,0xc0,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
839839

840+
# GFX12: global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
841+
0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
842+
843+
# GFX12: global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
844+
0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
845+
846+
# GFX12: global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
847+
0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
848+
849+
# GFX12: global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
850+
0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
851+
840852
# GFX12: global_load_addtid_b32 v1, off offset:64 ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
841853
0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
842854

0 commit comments

Comments
 (0)