Skip to content

Commit 7c8ff4b

Browse files
arsenm and srpande
authored and committed
AMDGPU: Allow f16/bf16 for DS_READ_TR16_B64 gfx950 builtins (llvm#118297)
Co-authored-by: Sirish Pande <[email protected]>
1 parent 8d7c2cc commit 7c8ff4b

File tree

6 files changed

+105
-0
lines changed

6 files changed

+105
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -467,6 +467,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950
467467
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
468468
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
469469
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
470+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4f16, "V4hV4h*3", "nc", "gfx950-insts")
471+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4bf16, "V4yV4y*3", "nc", "gfx950-insts")
470472

471473
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
472474
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18846,6 +18846,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1884618846
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
1884718847
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
1884818848
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
18849+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
18850+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
1884918851
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
1885018852
Intrinsic::ID IID;
1885118853
switch (BuiltinID) {
@@ -18871,6 +18873,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1887118873
IID = Intrinsic::amdgcn_ds_read_tr6_b96;
1887218874
break;
1887318875
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
18876+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
18877+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
1887418878
IID = Intrinsic::amdgcn_ds_read_tr16_b64;
1887518879
break;
1887618880
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
typedef int v2i __attribute__((ext_vector_type(2)));
55
typedef int v3i __attribute__((ext_vector_type(3)));
66
typedef short v4s __attribute__((ext_vector_type(4)));
7+
typedef half v4h __attribute__((ext_vector_type(4)));
8+
typedef __bf16 v4y __attribute__((ext_vector_type(4)));
79

810
// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32(
911
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
@@ -48,3 +50,24 @@ v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr)
4850
{
4951
return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr);
5052
}
53+
54+
// GFX950-LABEL: define dso_local <4 x half> @test_amdgcn_ds_read_b64_tr_b16_v2f16(
55+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
56+
// GFX950-NEXT: entry:
57+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) [[INPTR]])
58+
// GFX950-NEXT: ret <4 x half> [[TMP0]]
59+
//
60+
v4h test_amdgcn_ds_read_b64_tr_b16_v2f16(local v4h* inptr)
61+
{
62+
return __builtin_amdgcn_ds_read_tr16_b64_v4f16(inptr);
63+
}
64+
65+
// GFX950-LABEL: define dso_local <4 x bfloat> @test_amdgcn_ds_read_b64_tr_b16_v2bf16(
66+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
67+
// GFX950-NEXT: entry:
68+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) [[INPTR]])
69+
// GFX950-NEXT: ret <4 x bfloat> [[TMP0]]
70+
v4y test_amdgcn_ds_read_b64_tr_b16_v2bf16(local v4y* inptr)
71+
{
72+
return __builtin_amdgcn_ds_read_tr16_b64_v4bf16(inptr);
73+
}

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1201,6 +1201,8 @@ let SubtargetPredicate = HasGFX950Insts in {
12011201
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
12021202
def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>;
12031203
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>;
1204+
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4f16, int_amdgcn_ds_read_tr16_b64>;
1205+
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4bf16, int_amdgcn_ds_read_tr16_b64>;
12041206
}
12051207

12061208
//===----------------------------------------------------------------------===//

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,28 @@ bb:
305305
ret void
306306
}
307307

308+
declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3))
309+
310+
; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
311+
define amdgpu_kernel void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
312+
bb:
313+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
314+
%tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
315+
store <4 x half> %tmp0, ptr addrspace(1) %out, align 16
316+
ret void
317+
}
318+
319+
declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3))
320+
321+
; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
322+
define amdgpu_kernel void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
323+
bb:
324+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
325+
%tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
326+
store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
327+
ret void
328+
}
329+
308330
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
309331
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
310332

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
66
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
77
declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b64.v3i32.p3(ptr addrspace(3))
88
declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3))
9+
declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3))
10+
declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3))
911

1012
define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
1113
; GFX950-SDAG-LABEL: ds_read_b64_tr_b4:
@@ -106,3 +108,53 @@ entry:
106108
store <4 x i16> %val, ptr addrspace(1) %use
107109
ret void
108110
}
111+
112+
define amdgpu_ps void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
113+
; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4f16:
114+
; GFX950-SDAG: ; %bb.0: ; %entry
115+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
116+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
117+
; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
118+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
119+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
120+
; GFX950-SDAG-NEXT: s_endpgm
121+
;
122+
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4f16:
123+
; GFX950-GISEL: ; %bb.0: ; %entry
124+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
125+
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
126+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
127+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
128+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
129+
; GFX950-GISEL-NEXT: s_endpgm
130+
entry:
131+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
132+
%val = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3) %gep)
133+
store <4 x half> %val, ptr addrspace(1) %use
134+
ret void
135+
}
136+
137+
define amdgpu_ps void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
138+
; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4bf16:
139+
; GFX950-SDAG: ; %bb.0: ; %entry
140+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
141+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
142+
; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
143+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
144+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
145+
; GFX950-SDAG-NEXT: s_endpgm
146+
;
147+
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4bf16:
148+
; GFX950-GISEL: ; %bb.0: ; %entry
149+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
150+
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
151+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
152+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
153+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
154+
; GFX950-GISEL-NEXT: s_endpgm
155+
entry:
156+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
157+
%val = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3) %gep)
158+
store <4 x bfloat> %val, ptr addrspace(1) %use
159+
ret void
160+
}

0 commit comments

Comments
 (0)