Skip to content

Commit 41cf94e

Browse files
authored
[AMDGPU] - Add s_quadmask intrinsics (llvm#70804)
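Add the overloaded `llvm.amdgcn.s.quadmask` intrinsic, which is selected to `s_quadmask_b32` or `s_quadmask_b64` depending on the operand width. VGPR arguments are supported by inserting a `v_readfirstlane` to move the value into SGPRs.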
1 parent 18839ae commit 41cf94e

File tree

5 files changed

+112
-4
lines changed

5 files changed

+112
-4
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1932,6 +1932,11 @@ def int_amdgcn_inverse_ballot :
19321932
def int_amdgcn_s_bitreplicate :
19331933
DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
19341934

1935+
// Lowers to S_QUADMASK_B{32,64}
1936+
// The argument must be uniform; otherwise, the result is undefined.
1937+
def int_amdgcn_s_quadmask :
1938+
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
1939+
19351940
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
19361941
[data_ty],
19371942
[

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2995,6 +2995,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
29952995
return;
29962996
case Intrinsic::amdgcn_inverse_ballot:
29972997
case Intrinsic::amdgcn_s_bitreplicate:
2998+
case Intrinsic::amdgcn_s_quadmask:
29982999
applyDefaultMapping(OpdMapper);
29993000
constrainOpWithReadfirstlane(B, MI, 2); // Mask
30003001
return;
@@ -4537,6 +4538,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45374538
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
45384539
break;
45394540
}
4541+
case Intrinsic::amdgcn_s_quadmask: {
4542+
Register MaskReg = MI.getOperand(2).getReg();
4543+
unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4544+
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4545+
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
4546+
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4547+
break;
4548+
}
45404549
case Intrinsic::amdgcn_wave_reduce_umin:
45414550
case Intrinsic::amdgcn_wave_reduce_umax: {
45424551
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6484,8 +6484,10 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
64846484
return CreatedBB;
64856485
}
64866486

6487-
// Legalize S_BITREPLICATE
6488-
if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
6487+
// Legalize S_BITREPLICATE and S_QUADMASK
6488+
if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6489+
MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6490+
MI.getOpcode() == AMDGPU::S_QUADMASK_B64) {
64896491
MachineOperand &Src = MI.getOperand(1);
64906492
if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
64916493
Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,10 @@ def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">;
326326

327327
} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
328328

329-
def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32">;
330-
def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">;
329+
def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32",
330+
[(set i32:$sdst, (int_amdgcn_s_quadmask i32:$src0))]>;
331+
def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64",
332+
[(set i64:$sdst, (int_amdgcn_s_quadmask i64:$src0))]>;
331333

332334
let Uses = [M0] in {
333335
def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">;
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
3+
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
4+
5+
declare i32 @llvm.amdgcn.s.quadmask.i32(i32)
6+
declare i64 @llvm.amdgcn.s.quadmask.i64(i64)
7+
8+
define i32 @test_quadmask_constant_i32() {
9+
; GFX11-LABEL: test_quadmask_constant_i32:
10+
; GFX11: ; %bb.0: ; %entry
11+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12+
; GFX11-NEXT: s_quadmask_b32 s0, 0x85fe3a92
13+
; GFX11-NEXT: v_mov_b32_e32 v0, s0
14+
; GFX11-NEXT: s_setpc_b64 s[30:31]
15+
entry:
16+
%qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 u0x85FE3A92)
17+
ret i32 %qm
18+
}
19+
20+
define amdgpu_cs void @test_quadmask_sgpr_i32(i32 inreg %mask, ptr addrspace(1) %out) {
21+
; GFX11-LABEL: test_quadmask_sgpr_i32:
22+
; GFX11: ; %bb.0: ; %entry
23+
; GFX11-NEXT: s_quadmask_b32 s0, s0
24+
; GFX11-NEXT: v_mov_b32_e32 v2, s0
25+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
26+
; GFX11-NEXT: s_nop 0
27+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
28+
; GFX11-NEXT: s_endpgm
29+
entry:
30+
%qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %mask)
31+
store i32 %qm, ptr addrspace(1) %out
32+
ret void
33+
}
34+
35+
36+
define i32 @test_quadmask_vgpr_i32(i32 %mask) {
37+
; GFX11-LABEL: test_quadmask_vgpr_i32:
38+
; GFX11: ; %bb.0: ; %entry
39+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40+
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
41+
; GFX11-NEXT: s_quadmask_b32 s0, s0
42+
; GFX11-NEXT: v_mov_b32_e32 v0, s0
43+
; GFX11-NEXT: s_setpc_b64 s[30:31]
44+
entry:
45+
%qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %mask)
46+
ret i32 %qm
47+
}
48+
49+
define i64 @test_quadmask_constant_i64() {
50+
; GFX11-LABEL: test_quadmask_constant_i64:
51+
; GFX11: ; %bb.0: ; %entry
52+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
53+
; GFX11-NEXT: s_mov_b32 s0, 0x85fe3a92
54+
; GFX11-NEXT: s_mov_b32 s1, 0x67de48fc
55+
; GFX11-NEXT: s_quadmask_b64 s[0:1], s[0:1]
56+
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
57+
; GFX11-NEXT: s_setpc_b64 s[30:31]
58+
entry:
59+
%qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 u0x67DE48FC85FE3A92)
60+
ret i64 %qm
61+
}
62+
63+
define amdgpu_cs void @test_quadmask_sgpr_i64(i64 inreg %mask, ptr addrspace(1) %out) {
64+
; GFX11-LABEL: test_quadmask_sgpr_i64:
65+
; GFX11: ; %bb.0: ; %entry
66+
; GFX11-NEXT: s_quadmask_b64 s[0:1], s[0:1]
67+
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
68+
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
69+
; GFX11-NEXT: s_nop 0
70+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
71+
; GFX11-NEXT: s_endpgm
72+
entry:
73+
%qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %mask)
74+
store i64 %qm, ptr addrspace(1) %out
75+
ret void
76+
}
77+
78+
define i64 @test_quadmask_vgpr_i64(i64 %mask) {
79+
; GFX11-LABEL: test_quadmask_vgpr_i64:
80+
; GFX11: ; %bb.0: ; %entry
81+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82+
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
83+
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
84+
; GFX11-NEXT: s_quadmask_b64 s[0:1], s[0:1]
85+
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
86+
; GFX11-NEXT: s_setpc_b64 s[30:31]
87+
entry:
88+
%qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %mask)
89+
ret i64 %qm
90+
}

0 commit comments

Comments
 (0)