-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AMDGPU] Extend llvm.amdgcn.set.inactive intrinsic to support Reg32/Reg64 types #94457
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Vikram Hegde (vikramRH) ChangesMissed this while handling other patches. Any comments/concerns ? Patch is 20.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94457.diff 4 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bfb966dfad02d..1ce6acfdd6dc1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2356,7 +2356,7 @@ def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
- Intrinsic<[llvm_anyint_ty],
+ Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, // value to be copied
LLVMMatchType<0>], // value for the inactive lanes to take
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c32..418458f0171bd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -252,16 +252,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
// restoring it after we're done.
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VSrc_b32: $src, VSrc_b32:$inactive),
- [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
-}
+ (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64: $src, VSrc_b64:$inactive),
- [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
-}
+ (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
} // End Defs = [SCC]
+foreach vt = Reg32Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
+}
+
+foreach vt = Reg64Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+}
+
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index cbee039df7fd0..e08b7cf38ebb5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -93,6 +93,234 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
ret void
}
+define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
+; GCN-LABEL: set_inactive_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ store float %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
+; GCN-LABEL: set_inactive_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
+; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ store double %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
+; GCN-LABEL: set_inactive_v2i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ store <2 x i16> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+; GCN-LABEL: set_inactive_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ store <2 x half> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
+; GCN-LABEL: set_inactive_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 1
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ store <2 x i32> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; GCN-LABEL: set_inactive_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 1.0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ store <2 x float> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
+; GCN-LABEL: set_inactive_p0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ store ptr %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
+; GCN-LABEL: set_inactive_p2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ store ptr addrspace(2) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; GCN-LABEL: set_inactive_p3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ store ptr addrspace(3) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
+; GCN-LABEL: set_inactive_p5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ store ptr addrspace(5) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
+; GCN-LABEL: set_inactive_p6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ store ptr addrspace(6) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 8302af7450ed9..40d08f8692fb1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -124,6 +124,236 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
ret void
}
+define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
+; GCN-LABEL: set_inactive_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x40400000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ store float %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
+; GCN-LABEL: set_inactive_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ store double %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
+; GCN-LABEL: set_inactive_v2i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x10001
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ store <2 x i16> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+; GCN-LABEL: set_inactive_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x3c003c00
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ store <2 x half> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
+; GCN-LABEL: set_inactive_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s8, 1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ store <2 x i32> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; GCN-LABEL: set_inactive_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s8, 1.0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ store <2 x float> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
+; GCN-LABEL: set_inactive_p0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ store ptr %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
+; GCN-LABEL: set_inactive_p2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ store ptr addrspace(2) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; GCN-LABEL: set_inactive_p3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ store ptr addrspace(3) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
+; GCN-LABEL: set_inactive_p5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ store ptr addrspace(5) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
+; GCN-LABEL: set_inactive_p6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lgtm but I don't know if the graphics usage needs this
ret void | ||
} | ||
|
||
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
v4f16 is the other interesting case, plus bfloat
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added tests for v4i16, v4f16, v4bf16, v2bf16
required by atomic optimizer for now, not sure about other use cases though |
Gentle ping.. |
…eg64 types (llvm#94457) Missed this while handling other patches. Any comments/concerns ? Change-Id: I021d53131bca9e8e5cda37dc69bddd1dc23425b3
…eg64 types (llvm#94457) Missed this while handling other patches. Any comments/concerns ? Change-Id: Id96d774121edcaaef77c2a4b2e712c384ba8336b
Missed this while handling other patches. Any comments/concerns ?