Skip to content

Commit 42f311c

Browse files
committed
AMDGPU: Handle gfx950 global_load_lds_* instructions
Define global_load_lds_dwordx3 and global_load_lds_dwordx4. Oddly, it seems dwordx2 was skipped.
1 parent 82bb6e0 commit 42f311c

File tree

8 files changed

+236
-1
lines changed

8 files changed

+236
-1
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2452,7 +2452,7 @@ class AMDGPUGlobalLoadLDS :
24522452
[],
24532453
[LLVMQualPointerType<1>, // Base global pointer to load from
24542454
LLVMQualPointerType<3>, // LDS base pointer to store to
2455-
llvm_i32_ty, // Data byte size: 1/2/4
2455+
llvm_i32_ty, // Data byte size: 1/2/4 (or 12/16 on gfx950)
24562456
llvm_i32_ty, // imm offset (applied to both global and LDS address)
24572457
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
24582458
// bit 1 = sc1,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3329,6 +3329,16 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
33293329
case 4:
33303330
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
33313331
break;
3332+
case 12:
3333+
if (!Subtarget->hasLDSLoadB96_B128())
3334+
return false;
3335+
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3336+
break;
3337+
case 16:
3338+
if (!Subtarget->hasLDSLoadB96_B128())
3339+
return false;
3340+
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3341+
break;
33323342
}
33333343

33343344
MachineBasicBlock *MBB = MI.getParent();

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -934,6 +934,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho
934934
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
935935
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
936936

937+
let SubtargetPredicate = HasGFX950Insts in {
938+
defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">;
939+
defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">;
940+
}
941+
937942
let SubtargetPredicate = isGFX12Plus in {
938943
defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>;
939944
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
@@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>;
19801985
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
19811986
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;
19821987

1988+
defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
1989+
defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;
1990+
1991+
19831992
defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>;
19841993
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>;
19851994
defm GLOBAL_ATOMIC_ADD : FLAT_Global_Real_Atomics_vi <0x42>;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,6 +1289,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12891289
// hasGFX940Insts and hasGFX90AInsts are also true.
12901290
bool hasGFX950Insts() const { return GFX950Insts; }
12911291

1292+
/// Returns true if the target supports
1293+
/// global_load_lds_dwordx3/global_load_lds_dwordx4 or
1294+
/// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
1295+
bool hasLDSLoadB96_B128() const {
1296+
return hasGFX950Insts();
1297+
}
1298+
12921299
bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
12931300

12941301
bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9894,6 +9894,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
98949894
case 4:
98959895
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
98969896
break;
9897+
case 12:
9898+
if (!Subtarget->hasLDSLoadB96_B128())
9899+
return SDValue();
9900+
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
9901+
break;
9902+
case 16:
9903+
if (!Subtarget->hasLDSLoadB96_B128())
9904+
return SDValue();
9905+
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
9906+
break;
98979907
}
98989908

98999909
auto *M = cast<MemSDNode>(Op);
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
4+
5+
declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
6+
7+
;---------------------------------------------------------------------
8+
; dwordx3
9+
;---------------------------------------------------------------------
10+
11+
define amdgpu_ps void @global_load_lds_dwordx3_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
12+
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_vaddr:
13+
; GFX950-SDAG: ; %bb.0:
14+
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2
15+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
16+
; GFX950-SDAG-NEXT: s_nop 0
17+
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
18+
; GFX950-SDAG-NEXT: s_endpgm
19+
;
20+
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_vaddr:
21+
; GFX950-GISEL: ; %bb.0:
22+
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2
23+
; GFX950-GISEL-NEXT: s_nop 4
24+
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
25+
; GFX950-GISEL-NEXT: s_endpgm
26+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
27+
ret void
28+
}
29+
30+
define amdgpu_ps void @global_load_lds_dwordx3_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) {
31+
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr:
32+
; GFX950-SDAG: ; %bb.0:
33+
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
34+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
35+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
36+
; GFX950-SDAG-NEXT: s_nop 0
37+
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:32 nt
38+
; GFX950-SDAG-NEXT: s_endpgm
39+
;
40+
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr:
41+
; GFX950-GISEL: ; %bb.0:
42+
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
43+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
44+
; GFX950-GISEL-NEXT: s_nop 3
45+
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v0, s[0:1] offset:32 nt
46+
; GFX950-GISEL-NEXT: s_endpgm
47+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 32, i32 2)
48+
ret void
49+
}
50+
51+
define amdgpu_ps void @global_load_lds_dwordx3_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
52+
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr_and_vaddr:
53+
; GFX950-SDAG: ; %bb.0:
54+
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
55+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
56+
; GFX950-SDAG-NEXT: s_nop 0
57+
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1
58+
; GFX950-SDAG-NEXT: s_endpgm
59+
;
60+
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr_and_vaddr:
61+
; GFX950-GISEL: ; %bb.0:
62+
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
63+
; GFX950-GISEL-NEXT: s_nop 4
64+
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1
65+
; GFX950-GISEL-NEXT: s_endpgm
66+
%voffset.64 = zext i32 %voffset to i64
67+
%gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
68+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 12, i32 48, i32 16)
69+
ret void
70+
}
71+
72+
;---------------------------------------------------------------------
73+
; dwordx4
74+
;---------------------------------------------------------------------
75+
76+
define amdgpu_ps void @global_load_lds_dwordx4_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
77+
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_vaddr:
78+
; GFX950-SDAG: ; %bb.0:
79+
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2
80+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
81+
; GFX950-SDAG-NEXT: s_nop 0
82+
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
83+
; GFX950-SDAG-NEXT: s_endpgm
84+
;
85+
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_vaddr:
86+
; GFX950-GISEL: ; %bb.0:
87+
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2
88+
; GFX950-GISEL-NEXT: s_nop 4
89+
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
90+
; GFX950-GISEL-NEXT: s_endpgm
91+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
92+
ret void
93+
}
94+
95+
define amdgpu_ps void @global_load_lds_dwordx4_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) {
96+
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr:
97+
; GFX950-SDAG: ; %bb.0:
98+
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
99+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
100+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
101+
; GFX950-SDAG-NEXT: s_nop 0
102+
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:32 nt
103+
; GFX950-SDAG-NEXT: s_endpgm
104+
;
105+
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr:
106+
; GFX950-GISEL: ; %bb.0:
107+
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
108+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
109+
; GFX950-GISEL-NEXT: s_nop 3
110+
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v0, s[0:1] offset:32 nt
111+
; GFX950-GISEL-NEXT: s_endpgm
112+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 32, i32 2)
113+
ret void
114+
}
115+
116+
define amdgpu_ps void @global_load_lds_dwordx4_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
117+
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr_and_vaddr:
118+
; GFX950-SDAG: ; %bb.0:
119+
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
120+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
121+
; GFX950-SDAG-NEXT: s_nop 0
122+
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1
123+
; GFX950-SDAG-NEXT: s_endpgm
124+
;
125+
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr_and_vaddr:
126+
; GFX950-GISEL: ; %bb.0:
127+
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
128+
; GFX950-GISEL-NEXT: s_nop 4
129+
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1
130+
; GFX950-GISEL-NEXT: s_endpgm
131+
%voffset.64 = zext i32 %voffset to i64
132+
%gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
133+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 16, i32 48, i32 16)
134+
ret void
135+
}
136+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
137+
; GFX950: {{.*}}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 --strict-whitespace %s
2+
// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX940 --implicit-check-not=error: %s
3+
// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX90A --implicit-check-not=error: %s
4+
// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX10 --implicit-check-not=error: %s
5+
6+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU
7+
// GFX950: global_load_lds_dwordx3 v[2:3], off ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
8+
9+
global_load_lds_dwordx3 v[2:3], off
10+
11+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
12+
// GFX950: global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00]
13+
global_load_lds_dwordx3 v[2:3], off sc0 nt sc1
14+
15+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
16+
// GFX950: global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
17+
global_load_lds_dwordx3 v[2:3], off offset:4
18+
19+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
20+
// GFX950: global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00]
21+
global_load_lds_dwordx3 v2, s[4:5] offset:4
22+
23+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU
24+
// GFX950: global_load_lds_dwordx4 v[2:3], off ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
25+
global_load_lds_dwordx4 v[2:3], off
26+
27+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
28+
// GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00]
29+
global_load_lds_dwordx4 v[2:3], off sc0 nt sc1
30+
31+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
32+
// GFX950: global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
33+
global_load_lds_dwordx4 v[2:3], off offset:4
34+
35+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
36+
// GFX950: global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00]
37+
global_load_lds_dwordx4 v2, s[4:5] offset:4
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s
2+
3+
# GFX950: global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00]
4+
0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00
5+
6+
# GFX950: global_load_lds_dwordx3 v[2:3], off ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
7+
0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00
8+
9+
# GFX950: global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
10+
0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00
11+
12+
# GFX950: global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00]
13+
0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00
14+
15+
# GFX950: global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00]
16+
0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00
17+
18+
# GFX950: global_load_lds_dwordx4 v[2:3], off ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
19+
0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00
20+
21+
# GFX950: global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
22+
0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00
23+
24+
# GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00]
25+
0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00

0 commit comments

Comments
 (0)