Skip to content

Commit 1529804

Browse files
arsenm and srpande
authored and committed
AMDGPU: Add support for load transpose instructions for gfx950 (llvm#117378)
This patch adds support for the intrinsics in clang, as well as the assembly instructions in the backend. Co-authored-by: Sirish Pande <[email protected]>
1 parent 77cfa4f commit 1529804

File tree

13 files changed

+428
-3
lines changed

13 files changed

+428
-3
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,11 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiI
462462
TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap")
463463
TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap")
464464

465+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
466+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
467+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
468+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
469+
465470
//===----------------------------------------------------------------------===//
466471
// GFX12+ only builtins.
467472
//===----------------------------------------------------------------------===//

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18830,8 +18830,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1883018830
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
1883118831
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
1883218832
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
18833-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: {
18834-
18833+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
18834+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
18835+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
18836+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
18837+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
1883518838
Intrinsic::ID IID;
1883618839
switch (BuiltinID) {
1883718840
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
@@ -18846,6 +18849,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1884618849
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
1884718850
IID = Intrinsic::amdgcn_global_load_tr_b128;
1884818851
break;
18852+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
18853+
IID = Intrinsic::amdgcn_ds_read_tr4_b64;
18854+
break;
18855+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
18856+
IID = Intrinsic::amdgcn_ds_read_tr8_b64;
18857+
break;
18858+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
18859+
IID = Intrinsic::amdgcn_ds_read_tr6_b96;
18860+
break;
18861+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
18862+
IID = Intrinsic::amdgcn_ds_read_tr16_b64;
18863+
break;
1884918864
}
1885018865
llvm::Type *LoadTy = ConvertType(E->getType());
1885118866
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
2+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s
3+
4+
typedef int v2i __attribute__((ext_vector_type(2)));
5+
typedef int v3i __attribute__((ext_vector_type(3)));
6+
typedef short v4s __attribute__((ext_vector_type(4)));
7+
8+
// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32(
9+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
10+
// GFX950-NEXT: entry:
11+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[INPTR]])
12+
// GFX950-NEXT: ret <2 x i32> [[TMP0]]
13+
//
14+
v2i test_amdgcn_ds_read_b64_tr_b4_v2i32(local v2i* inptr)
15+
{
16+
return __builtin_amdgcn_ds_read_tr4_b64_v2i32(inptr);
17+
}
18+
19+
// GFX950-LABEL: define dso_local <3 x i32> @test_amdgcn_ds_read_b96_tr_b6_v3i32(
20+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
21+
// GFX950-NEXT: entry:
22+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) [[INPTR]])
23+
// GFX950-NEXT: ret <3 x i32> [[TMP0]]
24+
//
25+
v3i test_amdgcn_ds_read_b96_tr_b6_v3i32(local v3i* inptr)
26+
{
27+
return __builtin_amdgcn_ds_read_tr6_b96_v3i32(inptr);
28+
}
29+
30+
// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b8_v2i32(
31+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
32+
// GFX950-NEXT: entry:
33+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) [[INPTR]])
34+
// GFX950-NEXT: ret <2 x i32> [[TMP0]]
35+
//
36+
v2i test_amdgcn_ds_read_b64_tr_b8_v2i32(local v2i* inptr)
37+
{
38+
return __builtin_amdgcn_ds_read_tr8_b64_v2i32(inptr);
39+
}
40+
41+
// GFX950-LABEL: define dso_local <4 x i16> @test_amdgcn_ds_read_b64_tr_b16_v2i16(
42+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
43+
// GFX950-NEXT: entry:
44+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) [[INPTR]])
45+
// GFX950-NEXT: ret <4 x i16> [[TMP0]]
46+
//
47+
v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr)
48+
{
49+
return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr);
50+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
def global_ptr_ty : LLVMQualPointerType<1>;
14+
def local_ptr_ty : LLVMQualPointerType<3>;
1415

1516
// The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
1617
// by the backend cause whole-program undefined behavior when violated, such as
@@ -2751,6 +2752,10 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
27512752

27522753
def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
27532754
def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
2755+
def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
2756+
def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
2757+
def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
2758+
def int_amdgcn_ds_read_tr16_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
27542759

27552760
// i32 @llvm.amdgcn.wave.id()
27562761
def int_amdgcn_wave_id :

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4952,6 +4952,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49524952
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
49534953
case Intrinsic::amdgcn_global_load_tr_b64:
49544954
case Intrinsic::amdgcn_global_load_tr_b128:
4955+
case Intrinsic::amdgcn_ds_read_tr4_b64:
4956+
case Intrinsic::amdgcn_ds_read_tr6_b96:
4957+
case Intrinsic::amdgcn_ds_read_tr8_b64:
4958+
case Intrinsic::amdgcn_ds_read_tr16_b64:
49554959
return getDefaultMappingAllVGPR(MI);
49564960
case Intrinsic::amdgcn_ds_ordered_add:
49574961
case Intrinsic::amdgcn_ds_ordered_swap: {

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,11 @@ def : SourceOfDivergence<intr>;
353353
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
354354
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
355355

356+
def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>;
357+
def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>;
358+
def : SourceOfDivergence<int_amdgcn_ds_read_tr8_b64>;
359+
def : SourceOfDivergence<int_amdgcn_ds_read_tr16_b64>;
360+
356361
// The dummy boolean output is divergent from the IR's perspective,
357362
// but the mask results are uniform. These produce a divergent and
358363
// uniform result, so the returned struct is collectively divergent.

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,12 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
296296
}
297297
}
298298

299+
// Multiclass that emits exactly one DS_1A_RET pseudo with has_m0_read
// cleared. The inner def has an empty name, so the resulting record takes the
// name of the instantiating defm.
multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
300+
let has_m0_read = 0 in {
301+
def "" : DS_1A_RET<opName, rc>;
302+
}
303+
}
304+
299305
class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
300306
DS_1A_RET<opName, rc, 1>;
301307

@@ -746,6 +752,13 @@ multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
746752
defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
747753
} // let SubtargetPredicate = isGFX12Plus
748754

755+
let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
756+
defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
757+
defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>;
758+
defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", VReg_64>;
759+
defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", VReg_96>;
760+
}
761+
749762
//===----------------------------------------------------------------------===//
750763
// DS Patterns
751764
//===----------------------------------------------------------------------===//
@@ -1178,6 +1191,18 @@ def : GCNPat <
11781191
sub0)
11791192
>;
11801193

1194+
// Selection pattern for the DS load-transpose intrinsics: matches `node`
// producing `vt` on an address that DS1Addr1Offset splits into $ptr/$offset,
// and emits `inst` with those two operands.
// NOTE(review): the trailing (i1 0) is presumably the gds operand of the DS
// pseudo -- confirm against DS_1A_RET's operand list.
class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPat <
1195+
(vt (node (DS1Addr1Offset i32:$ptr, i32:$offset))),
1196+
(inst $ptr, Offset:$offset, (i1 0))
1197+
>;
1198+
1199+
let SubtargetPredicate = HasGFX950Insts in {
1200+
def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>;
1201+
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
1202+
def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>;
1203+
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>;
1204+
}
1205+
11811206
//===----------------------------------------------------------------------===//
11821207
// Target-specific instruction encodings.
11831208
//===----------------------------------------------------------------------===//
@@ -1747,3 +1772,11 @@ def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
17471772
def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
17481773
def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
17491774
def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
1775+
1776+
//===----------------------------------------------------------------------===//
1777+
// GFX950.
1778+
//===----------------------------------------------------------------------===//
1779+
def DS_READ_B64_TR_B4_vi : DS_Real_vi<0x0e0, DS_READ_B64_TR_B4>;
1780+
def DS_READ_B96_TR_B6_vi : DS_Real_vi<0x0e1, DS_READ_B96_TR_B6>;
1781+
def DS_READ_B64_TR_B8_vi : DS_Real_vi<0x0e2, DS_READ_B64_TR_B8>;
1782+
def DS_READ_B64_TR_B16_vi : DS_Real_vi<0x0e3, DS_READ_B64_TR_B16>;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1392,7 +1392,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13921392
return true;
13931393
}
13941394
case Intrinsic::amdgcn_global_load_tr_b64:
1395-
case Intrinsic::amdgcn_global_load_tr_b128: {
1395+
case Intrinsic::amdgcn_global_load_tr_b128:
1396+
case Intrinsic::amdgcn_ds_read_tr4_b64:
1397+
case Intrinsic::amdgcn_ds_read_tr6_b96:
1398+
case Intrinsic::amdgcn_ds_read_tr8_b64:
1399+
case Intrinsic::amdgcn_ds_read_tr16_b64: {
13961400
Info.opc = ISD::INTRINSIC_W_CHAIN;
13971401
Info.memVT = MVT::getVT(CI.getType());
13981402
Info.ptrVal = CI.getOperand(0);
@@ -1480,6 +1484,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
14801484
case Intrinsic::amdgcn_atomic_cond_sub_u32:
14811485
case Intrinsic::amdgcn_ds_append:
14821486
case Intrinsic::amdgcn_ds_consume:
1487+
case Intrinsic::amdgcn_ds_read_tr4_b64:
1488+
case Intrinsic::amdgcn_ds_read_tr6_b96:
1489+
case Intrinsic::amdgcn_ds_read_tr8_b64:
1490+
case Intrinsic::amdgcn_ds_read_tr16_b64:
14831491
case Intrinsic::amdgcn_ds_ordered_add:
14841492
case Intrinsic::amdgcn_ds_ordered_swap:
14851493
case Intrinsic::amdgcn_flat_atomic_fadd:

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,50 @@ bb:
261261
ret void
262262
}
263263

264+
declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3))
265+
266+
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
267+
define amdgpu_kernel void @ds_read_b64_tr4_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
268+
bb:
269+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
270+
%tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
271+
store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
272+
ret void
273+
}
274+
275+
declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3))
276+
277+
; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
278+
define amdgpu_kernel void @ds_read_b96_tr6_v3i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
279+
bb:
280+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
281+
%tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
282+
store <3 x i32> %tmp0, ptr addrspace(1) %out, align 16
283+
ret void
284+
}
285+
286+
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3))
287+
288+
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
289+
define amdgpu_kernel void @ds_read_b64_tr8_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
290+
bb:
291+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
292+
%tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
293+
store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
294+
ret void
295+
}
296+
297+
declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3))
298+
299+
; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
300+
; Uniformity test for llvm.amdgcn.ds.read.tr16.b64: the GEP index type (i32)
; and the 8-byte store alignment match the sibling 64-bit ds_read_tr tests.
define amdgpu_kernel void @ds_read_b64_tr_b16_v4i16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
301+
bb:
302+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
303+
%tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
304+
store <4 x i16> %tmp0, ptr addrspace(1) %out, align 8
305+
ret void
306+
}
307+
264308
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
265309
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
266310

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s
3+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s
4+
5+
declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
6+
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
7+
declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3))
8+
declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3))
9+
10+
define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
11+
; GFX950-SDAG-LABEL: ds_read_b64_tr_b4:
12+
; GFX950-SDAG: ; %bb.0: ; %entry
13+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
14+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
15+
; GFX950-SDAG-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
16+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
17+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
18+
; GFX950-SDAG-NEXT: s_endpgm
19+
;
20+
; GFX950-GISEL-LABEL: ds_read_b64_tr_b4:
21+
; GFX950-GISEL: ; %bb.0: ; %entry
22+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
23+
; GFX950-GISEL-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
24+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
25+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
26+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
27+
; GFX950-GISEL-NEXT: s_endpgm
28+
entry:
29+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
30+
%val = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
31+
store <2 x i32> %val, ptr addrspace(1) %use
32+
ret void
33+
}
34+
35+
define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
36+
; GFX950-SDAG-LABEL: ds_read_b96_tr_b6:
37+
; GFX950-SDAG: ; %bb.0: ; %entry
38+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2
39+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1
40+
; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
41+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
42+
; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
43+
; GFX950-SDAG-NEXT: s_endpgm
44+
;
45+
; GFX950-GISEL-LABEL: ds_read_b96_tr_b6:
46+
; GFX950-GISEL: ; %bb.0: ; %entry
47+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
48+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
49+
; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
50+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
51+
; GFX950-GISEL-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
52+
; GFX950-GISEL-NEXT: s_endpgm
53+
entry:
54+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
55+
%val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
56+
store <3 x i32> %val, ptr addrspace(1) %use
57+
ret void
58+
}
59+
60+
define amdgpu_ps void @ds_read_b64_tr_b8(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
61+
; GFX950-SDAG-LABEL: ds_read_b64_tr_b8:
62+
; GFX950-SDAG: ; %bb.0: ; %entry
63+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
64+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
65+
; GFX950-SDAG-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
66+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
67+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
68+
; GFX950-SDAG-NEXT: s_endpgm
69+
;
70+
; GFX950-GISEL-LABEL: ds_read_b64_tr_b8:
71+
; GFX950-GISEL: ; %bb.0: ; %entry
72+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
73+
; GFX950-GISEL-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
74+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
75+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
76+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
77+
; GFX950-GISEL-NEXT: s_endpgm
78+
entry:
79+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
80+
%val = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
81+
store <2 x i32> %val, ptr addrspace(1) %use
82+
ret void
83+
}
84+
85+
define amdgpu_ps void @ds_read_b64_tr_b16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
86+
; GFX950-SDAG-LABEL: ds_read_b64_tr_b16:
87+
; GFX950-SDAG: ; %bb.0: ; %entry
88+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
89+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
90+
; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
91+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
92+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
93+
; GFX950-SDAG-NEXT: s_endpgm
94+
;
95+
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16:
96+
; GFX950-GISEL: ; %bb.0: ; %entry
97+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
98+
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
99+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
100+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
101+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
102+
; GFX950-GISEL-NEXT: s_endpgm
103+
entry:
104+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
105+
%val = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3) %gep)
106+
store <4 x i16> %val, ptr addrspace(1) %use
107+
ret void
108+
}

0 commit comments

Comments
 (0)