Skip to content

Commit cf4c3d9

Browse files
authored
[AMDGPU] Extend llvm.amdgcn.set.inactive intrinsic to support Reg32/Reg64 types (#94457)
Missed this while handling other patches. Any comments/concerns?
1 parent 0113f26 commit cf4c3d9

File tree

5 files changed

+656
-8
lines changed

5 files changed

+656
-8
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2252,7 +2252,7 @@ def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
22522252
// program ever uses WQM, then the instruction and the first source will be
22532253
// computed in WQM.
22542254
def int_amdgcn_set_inactive :
2255-
Intrinsic<[llvm_anyint_ty],
2255+
Intrinsic<[llvm_any_ty],
22562256
[LLVMMatchType<0>, // value to be copied
22572257
LLVMMatchType<0>], // value for the inactive lanes to take
22582258
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -237,16 +237,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
237237
// restoring it after we're done.
238238
let Defs = [SCC], isConvergent = 1 in {
239239
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
240-
(ins VSrc_b32: $src, VSrc_b32:$inactive),
241-
[(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
242-
}
240+
(ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
243241

244242
def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
245-
(ins VSrc_b64: $src, VSrc_b64:$inactive),
246-
[(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
247-
}
243+
(ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
248244
} // End Defs = [SCC]
249245

246+
foreach vt = Reg32Types.types in {
247+
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
248+
(V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
249+
}
250+
251+
foreach vt = Reg64Types.types in {
252+
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
253+
(V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
254+
}
255+
250256
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
251257
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
252258

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ class RegisterTypes<list<ValueType> reg_types> {
586586

587587
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
588588
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
589-
def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
589+
def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16, p0]>;
590590

591591
let HasVGPR = 1 in {
592592
// VOP3 and VINTERP can access 256 lo and 256 hi registers.

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll

Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,326 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
9393
ret void
9494
}
9595

96+
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
97+
; GCN-LABEL: set_inactive_f32:
98+
; GCN: ; %bb.0:
99+
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
100+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
101+
; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
102+
; GCN-NEXT: s_mov_b32 s2, -1
103+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
104+
; GCN-NEXT: v_mov_b32_e32 v0, s3
105+
; GCN-NEXT: s_not_b64 exec, exec
106+
; GCN-NEXT: v_mov_b32_e32 v0, v1
107+
; GCN-NEXT: s_not_b64 exec, exec
108+
; GCN-NEXT: s_mov_b32 s3, 0xf000
109+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
110+
; GCN-NEXT: s_endpgm
111+
%tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
112+
store float %tmp, ptr addrspace(1) %out
113+
ret void
114+
}
115+
116+
define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
117+
; GCN-LABEL: set_inactive_f64:
118+
; GCN: ; %bb.0:
119+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
120+
; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
121+
; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
122+
; GCN-NEXT: v_mov_b32_e32 v2, s4
123+
; GCN-NEXT: v_mov_b32_e32 v3, s5
124+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
125+
; GCN-NEXT: v_mov_b32_e32 v0, s2
126+
; GCN-NEXT: v_mov_b32_e32 v1, s3
127+
; GCN-NEXT: s_not_b64 exec, exec
128+
; GCN-NEXT: v_mov_b32_e32 v0, v2
129+
; GCN-NEXT: v_mov_b32_e32 v1, v3
130+
; GCN-NEXT: s_not_b64 exec, exec
131+
; GCN-NEXT: s_mov_b32 s2, -1
132+
; GCN-NEXT: s_mov_b32 s3, 0xf000
133+
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
134+
; GCN-NEXT: s_endpgm
135+
%tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
136+
store double %tmp, ptr addrspace(1) %out
137+
ret void
138+
}
139+
140+
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
141+
; GCN-LABEL: set_inactive_v2i16:
142+
; GCN: ; %bb.0:
143+
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
144+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
145+
; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
146+
; GCN-NEXT: s_mov_b32 s2, -1
147+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
148+
; GCN-NEXT: v_mov_b32_e32 v0, s3
149+
; GCN-NEXT: s_not_b64 exec, exec
150+
; GCN-NEXT: v_mov_b32_e32 v0, v1
151+
; GCN-NEXT: s_not_b64 exec, exec
152+
; GCN-NEXT: s_mov_b32 s3, 0xf000
153+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
154+
; GCN-NEXT: s_endpgm
155+
%tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
156+
store <2 x i16> %tmp, ptr addrspace(1) %out
157+
ret void
158+
}
159+
160+
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
161+
; GCN-LABEL: set_inactive_v2f16:
162+
; GCN: ; %bb.0:
163+
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
164+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
165+
; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
166+
; GCN-NEXT: s_mov_b32 s2, -1
167+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
168+
; GCN-NEXT: v_mov_b32_e32 v0, s3
169+
; GCN-NEXT: s_not_b64 exec, exec
170+
; GCN-NEXT: v_mov_b32_e32 v0, v1
171+
; GCN-NEXT: s_not_b64 exec, exec
172+
; GCN-NEXT: s_mov_b32 s3, 0xf000
173+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
174+
; GCN-NEXT: s_endpgm
175+
%tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
176+
store <2 x half> %tmp, ptr addrspace(1) %out
177+
ret void
178+
}
179+
180+
define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
181+
; GCN-LABEL: set_inactive_v2i32:
182+
; GCN: ; %bb.0:
183+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
184+
; GCN-NEXT: s_mov_b32 s4, 1
185+
; GCN-NEXT: s_mov_b32 s5, s4
186+
; GCN-NEXT: v_mov_b32_e32 v2, s4
187+
; GCN-NEXT: v_mov_b32_e32 v3, s5
188+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
189+
; GCN-NEXT: v_mov_b32_e32 v0, s2
190+
; GCN-NEXT: v_mov_b32_e32 v1, s3
191+
; GCN-NEXT: s_not_b64 exec, exec
192+
; GCN-NEXT: v_mov_b32_e32 v0, v2
193+
; GCN-NEXT: v_mov_b32_e32 v1, v3
194+
; GCN-NEXT: s_not_b64 exec, exec
195+
; GCN-NEXT: s_mov_b32 s2, -1
196+
; GCN-NEXT: s_mov_b32 s3, 0xf000
197+
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
198+
; GCN-NEXT: s_endpgm
199+
%tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
200+
store <2 x i32> %tmp, ptr addrspace(1) %out
201+
ret void
202+
}
203+
204+
define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
205+
; GCN-LABEL: set_inactive_v2f32:
206+
; GCN: ; %bb.0:
207+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
208+
; GCN-NEXT: s_mov_b32 s4, 1.0
209+
; GCN-NEXT: s_mov_b32 s5, s4
210+
; GCN-NEXT: v_mov_b32_e32 v2, s4
211+
; GCN-NEXT: v_mov_b32_e32 v3, s5
212+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
213+
; GCN-NEXT: v_mov_b32_e32 v0, s2
214+
; GCN-NEXT: v_mov_b32_e32 v1, s3
215+
; GCN-NEXT: s_not_b64 exec, exec
216+
; GCN-NEXT: v_mov_b32_e32 v0, v2
217+
; GCN-NEXT: v_mov_b32_e32 v1, v3
218+
; GCN-NEXT: s_not_b64 exec, exec
219+
; GCN-NEXT: s_mov_b32 s2, -1
220+
; GCN-NEXT: s_mov_b32 s3, 0xf000
221+
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
222+
; GCN-NEXT: s_endpgm
223+
%tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
224+
store <2 x float> %tmp, ptr addrspace(1) %out
225+
ret void
226+
}
227+
228+
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
229+
; GCN-LABEL: set_inactive_v2bf16:
230+
; GCN: ; %bb.0:
231+
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
232+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
233+
; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
234+
; GCN-NEXT: s_mov_b32 s2, -1
235+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
236+
; GCN-NEXT: v_mov_b32_e32 v0, s3
237+
; GCN-NEXT: s_not_b64 exec, exec
238+
; GCN-NEXT: v_mov_b32_e32 v0, v1
239+
; GCN-NEXT: s_not_b64 exec, exec
240+
; GCN-NEXT: s_mov_b32 s3, 0xf000
241+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
242+
; GCN-NEXT: s_endpgm
243+
%tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
244+
store <2 x bfloat> %tmp, ptr addrspace(1) %out
245+
ret void
246+
}
247+
248+
define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
249+
; GCN-LABEL: set_inactive_v4i16:
250+
; GCN: ; %bb.0:
251+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
252+
; GCN-NEXT: s_mov_b32 s4, 0x10001
253+
; GCN-NEXT: s_mov_b32 s5, s4
254+
; GCN-NEXT: v_mov_b32_e32 v2, s4
255+
; GCN-NEXT: v_mov_b32_e32 v3, s5
256+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
257+
; GCN-NEXT: v_mov_b32_e32 v0, s2
258+
; GCN-NEXT: v_mov_b32_e32 v1, s3
259+
; GCN-NEXT: s_not_b64 exec, exec
260+
; GCN-NEXT: v_mov_b32_e32 v0, v2
261+
; GCN-NEXT: v_mov_b32_e32 v1, v3
262+
; GCN-NEXT: s_not_b64 exec, exec
263+
; GCN-NEXT: s_mov_b32 s2, -1
264+
; GCN-NEXT: s_mov_b32 s3, 0xf000
265+
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
266+
; GCN-NEXT: s_endpgm
267+
%tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
268+
store <4 x i16> %tmp, ptr addrspace(1) %out
269+
ret void
270+
}
271+
272+
define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
273+
; GCN-LABEL: set_inactive_v4f16:
274+
; GCN: ; %bb.0:
275+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
276+
; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
277+
; GCN-NEXT: s_mov_b32 s5, s4
278+
; GCN-NEXT: v_mov_b32_e32 v2, s4
279+
; GCN-NEXT: v_mov_b32_e32 v3, s5
280+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
281+
; GCN-NEXT: v_mov_b32_e32 v0, s2
282+
; GCN-NEXT: v_mov_b32_e32 v1, s3
283+
; GCN-NEXT: s_not_b64 exec, exec
284+
; GCN-NEXT: v_mov_b32_e32 v0, v2
285+
; GCN-NEXT: v_mov_b32_e32 v1, v3
286+
; GCN-NEXT: s_not_b64 exec, exec
287+
; GCN-NEXT: s_mov_b32 s2, -1
288+
; GCN-NEXT: s_mov_b32 s3, 0xf000
289+
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
290+
; GCN-NEXT: s_endpgm
291+
%tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
292+
store <4 x half> %tmp, ptr addrspace(1) %out
293+
ret void
294+
}
295+
296+
define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
297+
; GCN-LABEL: set_inactive_v4bf16:
298+
; GCN: ; %bb.0:
299+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
300+
; GCN-NEXT: s_mov_b32 s4, 0x3f803f80
301+
; GCN-NEXT: s_mov_b32 s5, s4
302+
; GCN-NEXT: v_mov_b32_e32 v2, s4
303+
; GCN-NEXT: v_mov_b32_e32 v3, s5
304+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
305+
; GCN-NEXT: v_mov_b32_e32 v0, s2
306+
; GCN-NEXT: v_mov_b32_e32 v1, s3
307+
; GCN-NEXT: s_not_b64 exec, exec
308+
; GCN-NEXT: v_mov_b32_e32 v0, v2
309+
; GCN-NEXT: v_mov_b32_e32 v1, v3
310+
; GCN-NEXT: s_not_b64 exec, exec
311+
; GCN-NEXT: s_mov_b32 s2, -1
312+
; GCN-NEXT: s_mov_b32 s3, 0xf000
313+
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
314+
; GCN-NEXT: s_endpgm
315+
%tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
316+
store <4 x bfloat> %tmp, ptr addrspace(1) %out
317+
ret void
318+
}
319+
320+
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
321+
; GCN-LABEL: set_inactive_p0:
322+
; GCN: ; %bb.0:
323+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
324+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
325+
; GCN-NEXT: v_mov_b32_e32 v0, s2
326+
; GCN-NEXT: v_mov_b32_e32 v1, s3
327+
; GCN-NEXT: s_not_b64 exec, exec
328+
; GCN-NEXT: v_mov_b32_e32 v0, 0
329+
; GCN-NEXT: v_mov_b32_e32 v1, 0
330+
; GCN-NEXT: s_not_b64 exec, exec
331+
; GCN-NEXT: s_mov_b32 s2, -1
332+
; GCN-NEXT: s_mov_b32 s3, 0xf000
333+
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
334+
; GCN-NEXT: s_endpgm
335+
%tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
336+
store ptr %tmp, ptr addrspace(1) %out
337+
ret void
338+
}
339+
340+
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
341+
; GCN-LABEL: set_inactive_p2:
342+
; GCN: ; %bb.0:
343+
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
344+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
345+
; GCN-NEXT: s_mov_b32 s2, -1
346+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
347+
; GCN-NEXT: v_mov_b32_e32 v0, s3
348+
; GCN-NEXT: s_not_b64 exec, exec
349+
; GCN-NEXT: v_mov_b32_e32 v0, 0
350+
; GCN-NEXT: s_not_b64 exec, exec
351+
; GCN-NEXT: s_mov_b32 s3, 0xf000
352+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
353+
; GCN-NEXT: s_endpgm
354+
%tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
355+
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
356+
ret void
357+
}
358+
359+
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
360+
; GCN-LABEL: set_inactive_p3:
361+
; GCN: ; %bb.0:
362+
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
363+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
364+
; GCN-NEXT: s_mov_b32 s2, -1
365+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
366+
; GCN-NEXT: v_mov_b32_e32 v0, s3
367+
; GCN-NEXT: s_not_b64 exec, exec
368+
; GCN-NEXT: v_mov_b32_e32 v0, 0
369+
; GCN-NEXT: s_not_b64 exec, exec
370+
; GCN-NEXT: s_mov_b32 s3, 0xf000
371+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
372+
; GCN-NEXT: s_endpgm
373+
%tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
374+
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
375+
ret void
376+
}
377+
378+
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
379+
; GCN-LABEL: set_inactive_p5:
380+
; GCN: ; %bb.0:
381+
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
382+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
383+
; GCN-NEXT: s_mov_b32 s2, -1
384+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
385+
; GCN-NEXT: v_mov_b32_e32 v0, s3
386+
; GCN-NEXT: s_not_b64 exec, exec
387+
; GCN-NEXT: v_mov_b32_e32 v0, 0
388+
; GCN-NEXT: s_not_b64 exec, exec
389+
; GCN-NEXT: s_mov_b32 s3, 0xf000
390+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
391+
; GCN-NEXT: s_endpgm
392+
%tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
393+
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
394+
ret void
395+
}
396+
397+
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
398+
; GCN-LABEL: set_inactive_p6:
399+
; GCN: ; %bb.0:
400+
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
401+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
402+
; GCN-NEXT: s_mov_b32 s2, -1
403+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
404+
; GCN-NEXT: v_mov_b32_e32 v0, s3
405+
; GCN-NEXT: s_not_b64 exec, exec
406+
; GCN-NEXT: v_mov_b32_e32 v0, 0
407+
; GCN-NEXT: s_not_b64 exec, exec
408+
; GCN-NEXT: s_mov_b32 s3, 0xf000
409+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
410+
; GCN-NEXT: s_endpgm
411+
%tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
412+
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
413+
ret void
414+
}
415+
96416
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
97417
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
98418
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)

0 commit comments

Comments
 (0)