-
Notifications
You must be signed in to change notification settings - Fork 13.5k
AMDGPU: Allow f16/bf16 for DS_READ_TR16_B64 gfx950 builtins #118297
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/add-f16-bf16-ds_read_tr16_b64-builtins
Dec 2, 2024
Merged
AMDGPU: Allow f16/bf16 for DS_READ_TR16_B64 gfx950 builtins #118297
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/add-f16-bf16-ds_read_tr16_b64-builtins
Dec 2, 2024
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This was referenced Dec 2, 2024
Merged
This was referenced Dec 2, 2024
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesCo-authored-by: Sirish Pande <[email protected]> Full diff: https://github.com/llvm/llvm-project/pull/118297.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 752a751a25a0c7..4758f6053ccb6d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -467,6 +467,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4f16, "V4hV4h*3", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4bf16, "V4yV4y*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index cb9c23b8e0a0d0..8d162d3b8add40 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19726,6 +19726,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
Intrinsic::ID IID;
switch (BuiltinID) {
@@ -19751,6 +19753,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
IID = Intrinsic::amdgcn_ds_read_tr6_b96;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
IID = Intrinsic::amdgcn_ds_read_tr16_b64;
break;
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
index 39fa46d5845f42..91e04430d4973a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
@@ -4,6 +4,8 @@
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v3i __attribute__((ext_vector_type(3)));
typedef short v4s __attribute__((ext_vector_type(4)));
+typedef half v4h __attribute__((ext_vector_type(4)));
+typedef __bf16 v4y __attribute__((ext_vector_type(4)));
// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
@@ -48,3 +50,24 @@ v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr)
{
return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr);
}
+
+// GFX950-LABEL: define dso_local <4 x half> @test_amdgcn_ds_read_b64_tr_b16_v2f16(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <4 x half> [[TMP0]]
+//
+v4h test_amdgcn_ds_read_b64_tr_b16_v2f16(local v4h* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr16_b64_v4f16(inptr);
+}
+
+// GFX950-LABEL: define dso_local <4 x bfloat> @test_amdgcn_ds_read_b64_tr_b16_v2bf16(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <4 x bfloat> [[TMP0]]
+v4y test_amdgcn_ds_read_b64_tr_b16_v2bf16(local v4y* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr16_b64_v4bf16(inptr);
+}
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 7cbd6d2dc62097..ef618727258cf2 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1202,6 +1202,8 @@ let SubtargetPredicate = HasGFX950Insts in {
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>;
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>;
+ def : DSLoadTrPat <DS_READ_B64_TR_B16, v4f16, int_amdgcn_ds_read_tr16_b64>;
+ def : DSLoadTrPat <DS_READ_B64_TR_B16, v4bf16, int_amdgcn_ds_read_tr16_b64>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index fb1420ee340043..aa5208560817fd 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -305,6 +305,28 @@ bb:
ret void
}
+declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
+ store <4 x half> %tmp0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
+ store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index 0689af0d56268d..8481a3c2ccdb15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -6,6 +6,8 @@ declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b64.v3i32.p3(ptr addrspace(3))
declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3))
+declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3))
+declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3))
define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b4:
@@ -106,3 +108,53 @@ entry:
store <4 x i16> %val, ptr addrspace(1) %use
ret void
}
+
+define amdgpu_ps void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3) %gep)
+ store <4 x half> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4bf16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4bf16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3) %gep)
+ store <4 x bfloat> %val, ptr addrspace(1) %use
+ ret void
+}
|
f82bf34
to
10bece7
Compare
This was referenced Dec 2, 2024
shiltian
approved these changes
Dec 2, 2024
Co-authored-by: Sirish Pande <[email protected]>
10bece7
to
46aca3b
Compare
searlmc1
pushed a commit
to ROCm/llvm-project
that referenced
this pull request
Feb 3, 2025
) Co-authored-by: Sirish Pande <[email protected]>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Labels
backend:AMDGPU
clang:codegen
IR generation bugs: mangling, exceptions, etc.
clang:frontend
Language frontend issues, e.g. anything involving "Sema"
clang
Clang issues not falling into any other category
llvm:analysis
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Co-authored-by: Sirish Pande [email protected]