Skip to content

AMDGPU: Allow f16/bf16 for DS_READ_TR16_B64 gfx950 builtins #118297

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4f16, "V4hV4h*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4bf16, "V4yV4y*3", "nc", "gfx950-insts")

TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19726,6 +19726,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
Intrinsic::ID IID;
switch (BuiltinID) {
Expand All @@ -19751,6 +19753,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
IID = Intrinsic::amdgcn_ds_read_tr6_b96;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
IID = Intrinsic::amdgcn_ds_read_tr16_b64;
break;
}
Expand Down
23 changes: 23 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v3i __attribute__((ext_vector_type(3)));
typedef short v4s __attribute__((ext_vector_type(4)));
typedef half v4h __attribute__((ext_vector_type(4)));
typedef __bf16 v4y __attribute__((ext_vector_type(4)));

// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
Expand Down Expand Up @@ -48,3 +50,24 @@ v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr)
{
return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr);
}

// GFX950-LABEL: define dso_local <4 x half> @test_amdgcn_ds_read_b64_tr_b16_v2f16(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// GFX950-NEXT: entry:
// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) [[INPTR]])
// GFX950-NEXT: ret <4 x half> [[TMP0]]
//
v4h test_amdgcn_ds_read_b64_tr_b16_v2f16(local v4h* inptr)
{
return __builtin_amdgcn_ds_read_tr16_b64_v4f16(inptr);
}

// GFX950-LABEL: define dso_local <4 x bfloat> @test_amdgcn_ds_read_b64_tr_b16_v2bf16(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// GFX950-NEXT: entry:
// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) [[INPTR]])
// GFX950-NEXT: ret <4 x bfloat> [[TMP0]]
v4y test_amdgcn_ds_read_b64_tr_b16_v2bf16(local v4y* inptr)
{
return __builtin_amdgcn_ds_read_tr16_b64_v4bf16(inptr);
}
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1202,6 +1202,8 @@ let SubtargetPredicate = HasGFX950Insts in {
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>;
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>;
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4f16, int_amdgcn_ds_read_tr16_b64>;
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4bf16, int_amdgcn_ds_read_tr16_b64>;
}

//===----------------------------------------------------------------------===//
Expand Down
22 changes: 22 additions & 0 deletions llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,28 @@ bb:
ret void
}

declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3))

; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
define amdgpu_kernel void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
bb:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
store <4 x half> %tmp0, ptr addrspace(1) %out, align 16
ret void
}

declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3))

; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
define amdgpu_kernel void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
bb:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
ret void
}

declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)

Expand Down
52 changes: 52 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b64.v3i32.p3(ptr addrspace(3))
declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3))
declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3))
declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3))

define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b4:
Expand Down Expand Up @@ -106,3 +108,53 @@ entry:
store <4 x i16> %val, ptr addrspace(1) %use
ret void
}

define amdgpu_ps void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4f16:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4f16:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX950-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3) %gep)
store <4 x half> %val, ptr addrspace(1) %use
ret void
}

define amdgpu_ps void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4bf16:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4bf16:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX950-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3) %gep)
store <4 x bfloat> %val, ptr addrspace(1) %use
ret void
}
Loading