Skip to content

Commit 0a6ed48

Browse files
author
rtayl
committed
[AMDGPU] Add llvm.amdgcn.raw.atomic.buffer.load intrinsic to support
OpAtomicLoad lowering. This adds the llvm.amdgcn.raw.atomic.buffer.load intrinsic to support OpAtomicLoad lowering on AMDGPU. Previously, OpAtomicLoad was lowered to llvm.amdgcn.raw.buffer.load, which in some cases caused the load to be marked as invariant and hoisted by LICM. Change-Id: I7d3989d98ab02508287223f24c3f7f81d312f0e6
1 parent ab553d6 commit 0a6ed48

File tree

5 files changed

+74
-1
lines changed

5 files changed

+74
-1
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,19 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
949949
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
950950
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
951951

952+
class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
953+
[data_ty],
954+
[llvm_v4i32_ty, // rsrc(SGPR)
955+
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
956+
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
957+
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
958+
// bit 1 = slc,
959+
// bit 2 = dlc on gfx10+),
960+
// swizzled buffer (bit 3 = swz))
961+
[IntrArgMemOnly, ImmArg<3>], "", [SDNPMemOperand]>,
962+
AMDGPURsrcIntrinsic<0>;
963+
def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad;
964+
952965
class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
953966
[data_ty],
954967
[llvm_v4i32_ty, // rsrc(SGPR)

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3776,6 +3776,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
37763776
case Intrinsic::amdgcn_struct_tbuffer_store:
37773777
return legalizeBufferStore(MI, MRI, B, true, true);
37783778
case Intrinsic::amdgcn_raw_buffer_load:
3779+
case Intrinsic::amdgcn_raw_atomic_buffer_load:
37793780
case Intrinsic::amdgcn_struct_buffer_load:
37803781
return legalizeBufferLoad(MI, MRI, B, false, false);
37813782
case Intrinsic::amdgcn_raw_buffer_load_format:

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3921,6 +3921,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
39213921
break;
39223922
}
39233923
case Intrinsic::amdgcn_raw_buffer_load:
3924+
case Intrinsic::amdgcn_raw_atomic_buffer_load:
39243925
case Intrinsic::amdgcn_raw_tbuffer_load: {
39253926
// FIXME: Should make intrinsic ID the last operand of the instruction,
39263927
// then this would be the same as store

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -928,7 +928,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
928928
}
929929

930930
Info.flags = MachineMemOperand::MODereferenceable;
931-
if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
931+
if (Attr.hasFnAttribute(Attribute::ReadOnly) ||
932+
IntrID == Intrinsic::amdgcn_raw_atomic_buffer_load) {
932933
Info.opc = ISD::INTRINSIC_W_CHAIN;
933934
// TODO: Account for dmask reducing loaded size.
934935
Info.memVT = memVTFromImageReturn(CI.getType());
@@ -6214,6 +6215,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
62146215
M->getMemOperand(), DAG);
62156216
}
62166217
case Intrinsic::amdgcn_raw_buffer_load:
6218+
case Intrinsic::amdgcn_raw_atomic_buffer_load:
62176219
case Intrinsic::amdgcn_raw_buffer_load_format: {
62186220
const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
62196221

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SI
2+
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SI
3+
;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=GFX10
4+
5+
;CHECK-LABEL: {{^}}raw_atomic_buffer_load
6+
;CHECK-LABEL: BB0_1: ; %bb1
7+
;CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
8+
;CHECK-NEXT: s_waitcnt lgkmcnt(0)
9+
;CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 glc
10+
;CHECK-NEXT: s_waitcnt vmcnt(0)
11+
;SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
12+
;GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
13+
;SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
14+
;GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
15+
;SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
16+
;GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
17+
;CHECK-NEXT: s_cbranch_execnz BB0_1
18+
define amdgpu_kernel void @raw_atomic_buffer_load(<4 x i32> %addr) {
19+
bb:
20+
%tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
21+
br label %bb1
22+
bb1:
23+
%0 = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
24+
%1 = icmp eq i32 %0, %tmp0
25+
br i1 %1, label %bb1, label %bb2
26+
bb2:
27+
ret void
28+
}
29+
30+
;CHECK-LABEL: {{^}}raw_nonatomic_buffer_load
31+
;CHECK: ; =>This Inner Loop Header: Depth=1
32+
;SI-NEXT: s_and_b64 s[2:3], exec, vcc
33+
;GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo
34+
;SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
35+
;GFX10-NEXT: s_or_b32 s0, s1, s0
36+
;SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
37+
;GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
38+
;CHECK-NEXT: s_cbranch_execnz BB1_1
39+
define amdgpu_kernel void @raw_nonatomic_buffer_load(<4 x i32> %addr) {
40+
bb:
41+
%tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
42+
br label %bb1
43+
bb1:
44+
%0 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
45+
%1 = icmp eq i32 %0, %tmp0
46+
br i1 %1, label %bb1, label %bb2
47+
bb2:
48+
ret void
49+
}
50+
51+
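; Intrinsic declarations. Unlike llvm.amdgcn.raw.buffer.load, the atomic variant is not declared readonly, so its load must not be treated as invariant and hoisted out of the loop.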
52+
declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
53+
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
54+
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg)
55+
declare i32 @llvm.amdgcn.workitem.id.x()
56+

0 commit comments

Comments
 (0)