Skip to content

[AMDGPU] Extend llvm.amdgcn.set.inactive intrinsic to support Reg32/Reg64 types #94457

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2356,7 +2356,7 @@ def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
Intrinsic<[llvm_anyint_ty],
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, // value to be copied
LLVMMatchType<0>], // value for the inactive lanes to take
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
Expand Down
18 changes: 12 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -252,16 +252,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
// restoring it after we're done.
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
(ins VSrc_b32: $src, VSrc_b32:$inactive),
[(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
}
(ins VSrc_b32: $src, VSrc_b32:$inactive), []>;

def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64: $src, VSrc_b64:$inactive),
[(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
}
(ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
} // End Defs = [SCC]

foreach vt = Reg32Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
(V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
}

foreach vt = Reg64Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
(V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
}

def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ class RegisterTypes<list<ValueType> reg_types> {

def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16, p0]>;

let HasVGPR = 1 in {
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
Expand Down
320 changes: 320 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,326 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
ret void
}

define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
store float %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
store double %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
store <2 x i16> %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

v4f16 is the other interesting case, plus bfloat

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added tests for v4i16, v4f16, v4bf16, v2bf16

; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
store <2 x half> %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s4, 1
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
store <2 x i32> %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s4, 1.0
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
store <2 x float> %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
store <2 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
; GCN-LABEL: set_inactive_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s4, 0x10001
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
store <4 x i16> %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; GCN-LABEL: set_inactive_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
store <4 x half> %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
; GCN-LABEL: set_inactive_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s4, 0x3f803f80
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
store <4 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
store ptr %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
ret void
}

declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
Expand Down
Loading
Loading