Skip to content

Commit db1cdae

Browse files
AMDGPU/GlobalISel: RegBankLegalize
Lower G_ instructions that can't be inst-selected with register bank assignment from StandaloneRegBankSelect based on uniformity analysis. - Lower instruction to perform it on assigned register bank - Put uniform value in vgpr because SALU instruction is not available - Execute divergent instruction in SALU - "waterfall loop" Given LLTs on all operands after legalizer, some register bank assignments require lowering while others do not. Note: cases where all register bank assignments would require lowering are lowered in legalizer. RegBankLegalize goals: - Define Rules: when and how to perform lowering - Goal of defining Rules is to provide a high-level, table-like brief overview of how to lower generic instructions based on available target features and uniformity info (uniform vs divergent). - Fast search of Rules, depends on how complicated Rule.Predicate is - For some opcodes there would be too many Rules that are essentially all the same just for different combinations of types and banks. Write custom function that handles all cases. - Rules are made from enum IDs that correspond to each operand. Names of IDs are meant to give a brief description of what lowering does for each operand or the whole instruction. - RegBankLegalizeHelper implements lowering algorithms and handles all IDs Since this is the first patch that actually enables -new-reg-bank-select here is the summary of regression tests that were added earlier: - if instruction is uniform always select SALU instruction if available - eliminate back to back vgpr to sgpr to vgpr copies of uniform values - fast rules: small differences for standard and vector instruction - enabling Rule based on target feature - salu_float - how to specify lowering algorithm - vgpr S64 AND to S32 - on G_TRUNC in reg, it is up to user to deal with truncated bits G_TRUNC in reg is treated as no-op. 
- dealing with truncated high bits - ABS S16 to S32 - sgpr S1 phi lowering - new opcodes for vcc-to-scc and scc-to-vcc copies - lowering for vgprS1-to-vcc copy (formally this is vgpr-to-vcc G_TRUNC) - S1 zext and sext lowering to select - uniform and divergent S1 AND(OR and XOR) lowering - inst-selected into SALU instruction - divergent phi with uniform inputs - divergent instruction with temporal divergent use, source instruction is defined as uniform(StandaloneRegBankSelect) - missing temporal divergence lowering - uniform phi, because of undef incoming, is assigned to vgpr. Will be fixed in StandaloneRegBankSelect via another fix in machine uniformity analysis.
1 parent df50c85 commit db1cdae

17 files changed

+2077
-258
lines changed

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,14 @@ class MachineRegisterInfo {
674674
return dyn_cast_if_present<const TargetRegisterClass *>(Val);
675675
}
676676

677+
/// Return the register bank of \p Reg.
678+
/// This shouldn't be used directly unless \p Reg has a register bank.
679+
const RegisterBank *getRegBank(Register Reg) const {
680+
assert(isa<const RegisterBank *>(VRegInfo[Reg.id()].first) &&
681+
"Register bank not set, wrong accessor");
682+
return cast<const RegisterBank *>(VRegInfo[Reg.id()].first);
683+
}
684+
677685
/// Return the register bank of \p Reg, or null if Reg has not been assigned
678686
/// a register bank or has been assigned a register class.
679687
/// \note It is possible to get the register bank from the register class via

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,120 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
107107
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
108108
}
109109
}
110+
111+
/// Build one G_READANYLANE with \p SgprDst as its def and \p VgprSrc as its
/// use, and return the builder for it.
///
/// Register banks are filled in only where missing: if the def has no bank it
/// is given the SGPR bank, and if the use has no bank it is given the VGPR
/// bank. Banks that are already set are left untouched.
MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B,
                                                const DstOp &SgprDst,
                                                const SrcOp &VgprSrc,
                                                const RegisterBankInfo &RBI) {
  MachineRegisterInfo &MRI = *B.getMRI();
  auto ReadLane = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc});

  const Register DstReg = ReadLane->getOperand(0).getReg();
  const Register SrcReg = ReadLane->getOperand(1).getReg();

  if (MRI.getRegBankOrNull(DstReg) == nullptr)
    MRI.setRegBank(DstReg, RBI.getRegBank(AMDGPU::SGPRRegBankID));
  if (MRI.getRegBankOrNull(SrcReg) == nullptr)
    MRI.setRegBank(SrcReg, RBI.getRegBank(AMDGPU::VGPRRegBankID));

  return ReadLane;
}
125+
126+
/// Read \p VgprSrc into \p SgprDst piece by piece.
///
/// \p VgprSrc is unmerged into pieces of type \p B32Ty, each piece goes
/// through buildReadAnyLaneB32, and the results are merged back into
/// \p SgprDst. The merged result is assigned the SGPR bank. Returns the
/// builder of the final merge instruction.
MachineInstrBuilder
AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst,
                                      const SrcOp &VgprSrc, LLT B32Ty,
                                      const RegisterBankInfo &RBI) {
  auto Pieces = B.buildUnmerge(B32Ty, VgprSrc);
  // Every operand except the last one is treated as an unmerged piece.
  const unsigned NumPieces = Pieces->getNumOperands() - 1;

  SmallVector<Register, 8> SgprPieces;
  for (unsigned Idx = 0; Idx < NumPieces; ++Idx) {
    auto Read = buildReadAnyLaneB32(B, B32Ty, Pieces.getReg(Idx), RBI);
    SgprPieces.push_back(Read.getReg(0));
  }

  auto Result = B.buildMergeLikeInstr(SgprDst, SgprPieces);
  MachineRegisterInfo &MRI = *B.getMRI();
  MRI.setRegBank(Result.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
  return Result;
}
142+
143+
/// Read \p VgprSrc into \p SgprDst by going through its 64-bit pieces.
///
/// \p VgprSrc is unmerged into S64 pieces (each assigned the VGPR bank). Every
/// piece is split into two S32 halves, both halves go through
/// buildReadAnyLaneB32, and the halves are merged back into an S64 on the SGPR
/// bank. Finally all S64 results are merged into \p SgprDst, which is also
/// assigned the SGPR bank. Returns the builder of that final merge.
MachineInstrBuilder
AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst,
                                      const SrcOp &VgprSrc,
                                      const RegisterBankInfo &RBI) {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  MachineRegisterInfo &MRI = *B.getMRI();

  auto Pieces64 = B.buildUnmerge(S64, VgprSrc);
  // Every operand except the last one is treated as an unmerged piece.
  const unsigned NumPieces = Pieces64->getNumOperands() - 1;

  SmallVector<Register, 8> SgprPieces;
  for (unsigned Idx = 0; Idx < NumPieces; ++Idx) {
    const Register VgprPiece = Pieces64.getReg(Idx);
    MRI.setRegBank(VgprPiece, RBI.getRegBank(AMDGPU::VGPRRegBankID));

    // Split the 64-bit piece into halves and read them one at a time.
    auto Halves = B.buildUnmerge(S32, VgprPiece);
    SmallVector<Register, 2> SgprHalves;
    for (unsigned Half = 0; Half < 2; ++Half) {
      auto Read = buildReadAnyLaneB32(B, S32, Halves.getReg(Half), RBI);
      SgprHalves.push_back(Read.getReg(0));
    }

    const Register SgprPiece =
        B.buildMergeLikeInstr(S64, SgprHalves).getReg(0);
    MRI.setRegBank(SgprPiece, RBI.getRegBank(AMDGPU::SGPRRegBankID));
    SgprPieces.push_back(SgprPiece);
  }

  auto Result = B.buildMergeLikeInstr(SgprDst, SgprPieces);
  MRI.setRegBank(Result.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
  return Result;
}
170+
171+
/// Read \p VgprSrc into \p SgprDst, choosing the strategy from the type of
/// \p SgprDst:
///  - S16: any-extend to S32, read with buildReadAnyLaneB32, truncate back.
///  - S32, V2S16, 32-bit pointer: a single buildReadAnyLaneB32.
///  - S64, S256, 64-bit pointer, vector of S32: sequence of S32 reads.
///  - vector of S16: sequence of V2S16 reads.
///  - vector of S64: buildReadAnyLaneSequenceOfS64.
/// Any other type hits llvm_unreachable.
MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B,
                                             const DstOp &SgprDst,
                                             const SrcOp &VgprSrc,
                                             const RegisterBankInfo &RBI) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S256 = LLT::scalar(256);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT Ty = SgprDst.getLLTTy(MRI);

  if (Ty == S16) {
    // Widen, read, narrow; instructions are emitted in exactly this order.
    auto Widened = B.buildAnyExt(S32, VgprSrc);
    auto Read = buildReadAnyLaneB32(B, S32, Widened, RBI);
    return B.buildTrunc(SgprDst, Read);
  }

  const bool IsPtr = Ty.isPointer();
  const bool Is32BitPtr = IsPtr && Ty.getSizeInBits() == 32;
  const bool Is64BitPtr = IsPtr && Ty.getSizeInBits() == 64;

  if (Ty == S32 || Ty == V2S16 || Is32BitPtr)
    return buildReadAnyLaneB32(B, SgprDst, VgprSrc, RBI);

  if (Ty == S64 || Ty == S256 || Is64BitPtr)
    return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI);

  if (Ty.isVector()) {
    // Element type is only queried once we know Ty is a vector.
    const LLT EltTy = Ty.getElementType();
    if (EltTy == S32)
      return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI);
    if (EltTy == S16)
      return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI);
    if (EltTy == S64)
      return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI);
  }

  llvm_unreachable("Type not supported");
}
208+
209+
/// If operand 0 of \p MI is on the SGPR bank, redirect \p MI to define a fresh
/// VGPR-bank register of the same type and emit buildReadAnyLane right after
/// \p MI to move that value into the original SGPR destination.
/// Does nothing when operand 0 is not on the SGPR bank.
void AMDGPU::buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
                                 const RegisterBankInfo &RBI) {
  MachineRegisterInfo &MRI = *B.getMRI();
  MachineOperand &Def = MI.getOperand(0);
  const Register SgprDst = Def.getReg();

  const RegisterBank &SgprBank = RBI.getRegBank(AMDGPU::SGPRRegBankID);
  if (MRI.getRegBankOrNull(SgprDst) != &SgprBank)
    return;

  // MI now writes a new VGPR register instead of its original destination.
  const Register VgprDst =
      MRI.createGenericVirtualRegister(MRI.getType(SgprDst));
  MRI.setRegBank(VgprDst, RBI.getRegBank(AMDGPU::VGPRRegBankID));
  Def.setReg(VgprDst);

  // readAnyLane VgprDst into the original destination, directly after MI.
  B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
  buildReadAnyLane(B, SgprDst, VgprDst, RBI);
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1010
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1111

12+
#include "AMDGPURegisterBankInfo.h"
13+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1214
#include "llvm/ADT/DenseSet.h"
15+
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
16+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1317
#include "llvm/CodeGen/MachineFunction.h"
1418
#include "llvm/CodeGen/Register.h"
1519
#include <utility>
@@ -48,7 +52,36 @@ class IntrinsicLaneMaskAnalyzer {
4852
// This will not be needed when we turn off LCSSA for global-isel.
4953
void findLCSSAPhi(Register Reg);
5054
};
51-
}
52-
}
55+
56+
// NOTE(review): no definition of buildReadAnyLaneS1 is visible in the hunks
// of this commit shown here - confirm it is defined (or still used) elsewhere.
void buildReadAnyLaneS1(MachineIRBuilder &B, MachineInstr &MI,
57+
const RegisterBankInfo &RBI);
58+
59+
// Build a single G_READANYLANE from VgprSrc into SgprDst. Operands that have
// no register bank yet get the sgpr (dst) / vgpr (src) bank.
MachineInstrBuilder buildReadAnyLaneB32(MachineIRBuilder &B,
60+
const DstOp &SgprDst,
61+
const SrcOp &VgprSrc,
62+
const RegisterBankInfo &RBI);
63+
64+
// Unmerge VgprSrc into B32Ty pieces, buildReadAnyLaneB32 each piece, and merge
// the results into SgprDst (assigned the sgpr bank).
MachineInstrBuilder buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B,
65+
const DstOp &SgprDst,
66+
const SrcOp &VgprSrc,
67+
LLT B32Ty,
68+
const RegisterBankInfo &RBI);
69+
70+
// Unmerge VgprSrc into S64 pieces, read each piece as two S32 halves with
// buildReadAnyLaneB32, and merge everything back into SgprDst (sgpr bank).
MachineInstrBuilder buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B,
71+
const DstOp &SgprDst,
72+
const SrcOp &VgprSrc,
73+
const RegisterBankInfo &RBI);
74+
75+
// Read VgprSrc into SgprDst, dispatching on the type of SgprDst to one of the
// helpers above. Unsupported types are llvm_unreachable.
MachineInstrBuilder buildReadAnyLane(MachineIRBuilder &B, const DstOp &SgprDst,
76+
const SrcOp &VgprSrc,
77+
const RegisterBankInfo &RBI);
78+
79+
// Create new vgpr destination register for MI then move it to current
80+
// MI's sgpr destination using one or more G_READANYLANE instructions.
81+
void buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
82+
const RegisterBankInfo &RBI);
83+
84+
} // namespace AMDGPU
85+
} // namespace llvm
5386

5487
#endif

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
217217
return true;
218218
}
219219

220+
/// Select G_COPY_SCC_VCC: compare the source against zero with
/// S_CMP_LG_U64 (wave64) or S_CMP_LG_U32 (otherwise), then copy SCC into the
/// destination, which is constrained to SGPR_32.
/// Returns false if constraining the compare or the destination fails.
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  const unsigned CmpOpc =
      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
  MachineInstr *Cmp =
      BuildMI(MBB, &I, DL, TII.get(CmpOpc)).addReg(SrcReg).addImm(0);
  if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
    return false;

  // The compare result lives in SCC; move it into the destination register.
  BuildMI(MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SGPR_32RegClass, *MRI);
}
238+
239+
/// Select G_COPY_VCC_SCC.
///
/// Constant source (found by looking through to a G_CONSTANT):
///  - 0 becomes S_MOV_B64 / S_MOV_B32 of immediate 0 (wave64 / otherwise),
///  - 1 becomes a COPY of exec;
///  the destination is then constrained to the bool register class.
/// Non-constant source: copy the source into SCC and emit
/// S_CSELECT_B64 / S_CSELECT_B32 choosing between exec and 0.
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();
  const bool IsWave64 = STI.isWave64();

  if (std::optional<ValueAndVReg> Const =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI)) {
    const int64_t Value = Const->Value.getZExtValue();
    if (Value != 0) {
      assert(Value == 1);
      BuildMI(MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
          .addReg(TRI.getExec());
    } else {
      const unsigned MovOpc = IsWave64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(MBB, &I, DL, TII.get(MovOpc), DstReg).addImm(0);
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  const unsigned SelectOpc =
      IsWave64 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *Select = BuildMI(MBB, &I, DL, TII.get(SelectOpc), DstReg)
                             .addReg(TRI.getExec())
                             .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
}
273+
274+
/// Select G_READANYLANE by replacing it with V_READFIRSTLANE_B32 on the same
/// destination and source registers. Returns the result of constraining the
/// new instruction's register operands.
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  auto ReadFirstLane =
      BuildMI(MBB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
          .addReg(SrcReg);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*ReadFirstLane, TII, TRI, RBI);
}
287+
220288
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
221289
const Register DefReg = I.getOperand(0).getReg();
222290
const LLT DefTy = MRI->getType(DefReg);
@@ -249,7 +317,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
249317
}
250318
}
251319

252-
// TODO: Verify that all registers have the same bank
320+
// If inputs have register bank, assign corresponding reg class.
321+
// Note: registers don't need to have the same reg bank.
322+
for (unsigned i = 1; i < I.getNumOperands(); i += 2) {
323+
const Register SrcReg = I.getOperand(i).getReg();
324+
325+
const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
326+
if (RB) {
327+
const LLT SrcTy = MRI->getType(SrcReg);
328+
const TargetRegisterClass *SrcRC =
329+
TRI.getRegClassForTypeOnBank(SrcTy, *RB);
330+
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
331+
return false;
332+
}
333+
}
334+
253335
I.setDesc(TII.get(TargetOpcode::PHI));
254336
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
255337
}
@@ -3656,6 +3738,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
36563738
return selectStackRestore(I);
36573739
case AMDGPU::G_PHI:
36583740
return selectPHI(I);
3741+
case AMDGPU::G_COPY_SCC_VCC:
3742+
return selectCOPY_SCC_VCC(I);
3743+
case AMDGPU::G_COPY_VCC_SCC:
3744+
return selectCOPY_VCC_SCC(I);
3745+
case AMDGPU::G_READANYLANE:
3746+
return selectReadAnyLane(I);
36593747
case TargetOpcode::G_CONSTANT:
36603748
case TargetOpcode::G_FCONSTANT:
36613749
default:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
8787

8888
bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
8989
bool selectCOPY(MachineInstr &I) const;
90+
bool selectCOPY_SCC_VCC(MachineInstr &I) const;
91+
bool selectCOPY_VCC_SCC(MachineInstr &I) const;
92+
bool selectReadAnyLane(MachineInstr &I) const;
9093
bool selectPHI(MachineInstr &I) const;
9194
bool selectG_TRUNC(MachineInstr &I) const;
9295
bool selectG_SZA_EXT(MachineInstr &I) const;

0 commit comments

Comments
 (0)