Skip to content

Commit 921a702

Browse files
AMDGPU/GlobalISel: RBLegalize
Lower G_ instructions that can't be inst-selected with register bank assignment from RBSelect based on uniformity analysis. - Lower instruction to perform it on assigned register bank - Put uniform value in vgpr because SALU instruction is not available - Execute divergent instruction in SALU - "waterfall loop" Given LLTs on all operands after legalizer, some register bank assignments require lowering while others do not. Note: cases where all register bank assignments would require lowering are lowered in legalizer. RBLegalize goals: - Define Rules: when and how to perform lowering - Goal of defining Rules is to provide a high-level, table-like brief overview of how to lower generic instructions based on available target features and uniformity info (uniform vs divergent). - Fast search of Rules, depends on how complicated Rule.Predicate is - For some opcodes there would be too many Rules that are essentially all the same just for different combinations of types and banks. Write custom function that handles all cases. - Rules are made from enum IDs that correspond to each operand. Names of IDs are meant to give brief description what lowering does for each operand or the whole instruction. - RBLegalizeHelper implements lowering algorithms and handles all IDs Since this is the first patch that actually enables -new-reg-bank-select here is the summary of regression tests that were added earlier: - if instruction is uniform always select SALU instruction if available - eliminate back to back vgpr to sgpr to vgpr copies of uniform values - fast rules: small differences for standard and vector instruction - enabling Rule based on target feature - salu_float - how to specify lowering algorithm - vgpr S64 AND to S32 - on G_TRUNC in reg, it is up to user to deal with truncated bits; G_TRUNC in reg is treated as no-op. 
- dealing with truncated high bits - ABS S16 to S32 - sgpr S1 phi lowering - new opcodes for vcc-to-scc and scc-to-vcc copies - lowering for vgprS1-to-vcc copy (formally this is vgpr-to-vcc G_TRUNC) - S1 zext and sext lowering to select - uniform and divergent S1 AND(OR and XOR) lowering - inst-selected into SALU instruction - divergent phi with uniform inputs - divergent instruction with temporal divergent use, source instruction is defined as uniform(RBSelect) - missing temporal divergence lowering - uniform phi, because of undef incoming, is assigned to vgpr. Will be fixed in RBSelect via another fix in machine uniformity analysis.
1 parent 2124eb3 commit 921a702

16 files changed

+2089
-253
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
107107
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
108108
}
109109
}
110+
111+
/// Build a single G_READANYLANE from \p VgprSrc into \p SgprDst.
///
/// After the instruction is created, its destination register is assigned the
/// SGPR bank and its source register the VGPR bank, but only for registers
/// that do not have a register bank yet.
///
/// \returns the builder for the new G_READANYLANE instruction.
MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B,
                                                const DstOp &SgprDst,
                                                const SrcOp &VgprSrc,
                                                const RegisterBankInfo &RBI) {
  auto ReadAnyLane =
      B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc});
  const Register DefReg = ReadAnyLane->getOperand(0).getReg();
  const Register UseReg = ReadAnyLane->getOperand(1).getReg();
  MachineRegisterInfo &MRI = *B.getMRI();

  // Leave an already assigned bank untouched; only fill in missing ones.
  auto AssignBankIfMissing = [&MRI, &RBI](Register Reg, unsigned BankID) {
    if (!MRI.getRegBankOrNull(Reg))
      MRI.setRegBank(Reg, RBI.getRegBank(BankID));
  };
  AssignBankIfMissing(DefReg, SGPRRegBankID);
  AssignBankIfMissing(UseReg, VGPRRegBankID);

  return ReadAnyLane;
}
125+
126+
/// Unmerge \p VgprSrc into pieces of type \p B32Ty, emit one G_READANYLANE per
/// piece, and merge the results into \p SgprDst.
///
/// The register defined by the final merge is assigned the SGPR bank.
///
/// \returns the builder for the final merge-like instruction.
MachineInstrBuilder
AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst,
                                      const SrcOp &VgprSrc, LLT B32Ty,
                                      const RegisterBankInfo &RBI) {
  MachineRegisterInfo &MRI = *B.getMRI();

  auto VgprPieces = B.buildUnmerge(B32Ty, VgprSrc);
  // The last operand of the unmerge is its source; all others are pieces.
  const unsigned NumPieces = VgprPieces->getNumOperands() - 1;

  SmallVector<Register, 8> SgprPieces;
  for (unsigned Idx = 0; Idx < NumPieces; ++Idx) {
    auto ReadPiece = buildReadAnyLaneB32(B, B32Ty, VgprPieces.getReg(Idx), RBI);
    SgprPieces.push_back(ReadPiece.getReg(0));
  }

  auto SgprMerge = B.buildMergeLikeInstr(SgprDst, SgprPieces);
  MRI.setRegBank(SgprMerge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
  return SgprMerge;
}
142+
143+
/// Unmerge \p VgprSrc into S64 elements and read each element through two
/// G_READANYLANE instructions (one per S32 half), then rebuild \p SgprDst.
///
/// Each S64 element produced by the outer unmerge is assigned the VGPR bank.
/// Each re-merged S64 element and the final merge result are assigned the SGPR
/// bank.
///
/// \returns the builder for the final merge-like instruction.
MachineInstrBuilder
AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst,
                                      const SrcOp &VgprSrc,
                                      const RegisterBankInfo &RBI) {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  MachineRegisterInfo &MRI = *B.getMRI();

  SmallVector<Register, 8> SgprElts;
  auto VgprElts = B.buildUnmerge(S64, VgprSrc);
  // The last operand of the unmerge is its source; all others are elements.
  const unsigned NumElts = VgprElts->getNumOperands() - 1;

  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    const Register VgprElt = VgprElts.getReg(Idx);
    MRI.setRegBank(VgprElt, RBI.getRegBank(AMDGPU::VGPRRegBankID));

    // Split the 64-bit element into two 32-bit halves and read each of them.
    auto VgprHalves = B.buildUnmerge(S32, VgprElt);
    SmallVector<Register, 2> SgprHalves;
    for (unsigned Half = 0; Half < 2; ++Half) {
      auto ReadHalf = buildReadAnyLaneB32(B, S32, VgprHalves.getReg(Half), RBI);
      SgprHalves.push_back(ReadHalf.getReg(0));
    }

    const Register SgprElt = B.buildMergeLikeInstr(S64, SgprHalves).getReg(0);
    MRI.setRegBank(SgprElt, RBI.getRegBank(AMDGPU::SGPRRegBankID));
    SgprElts.push_back(SgprElt);
  }

  auto SgprMerge = B.buildMergeLikeInstr(SgprDst, SgprElts);
  MRI.setRegBank(SgprMerge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
  return SgprMerge;
}
170+
171+
/// Build the instruction sequence that reads \p VgprSrc into \p SgprDst using
/// G_READANYLANE, choosing the strategy from the type of \p SgprDst:
///   - S16: any-extend to S32, read, then truncate back.
///   - S32, V2S16, 32-bit pointer: a single G_READANYLANE.
///   - S64, S256, 64-bit pointer, vector of S32: sequence of S32 reads.
///   - vector of S16: sequence of V2S16 reads.
///   - vector of S64: sequence of S64 elements, each read as two S32 halves.
/// Any other type hits llvm_unreachable.
MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B,
                                             const DstOp &SgprDst,
                                             const SrcOp &VgprSrc,
                                             const RegisterBankInfo &RBI) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S256 = LLT::scalar(256);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT Ty = SgprDst.getLLTTy(MRI);

  // Small predicates to keep the dispatch below readable.
  auto IsPointerOfSize = [&Ty](unsigned Bits) {
    return Ty.isPointer() && Ty.getSizeInBits() == Bits;
  };
  auto IsVectorOf = [&Ty](LLT EltTy) {
    return Ty.isVector() && Ty.getElementType() == EltTy;
  };

  if (Ty == S16) {
    auto Extended = B.buildAnyExt(S32, VgprSrc);
    auto Read = buildReadAnyLaneB32(B, S32, Extended, RBI);
    return B.buildTrunc(SgprDst, Read);
  }

  if (Ty == S32 || Ty == V2S16 || IsPointerOfSize(32))
    return buildReadAnyLaneB32(B, SgprDst, VgprSrc, RBI);

  if (Ty == S64 || Ty == S256 || IsPointerOfSize(64) || IsVectorOf(S32))
    return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI);

  if (IsVectorOf(S16))
    return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI);

  if (IsVectorOf(S64))
    return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI);

  llvm_unreachable("Type not supported");
}
208+
209+
/// If operand 0 of \p MI is on the SGPR bank, redirect \p MI to define a fresh
/// VGPR-bank register of the same type and, right after \p MI, read that
/// register back into the original destination via buildReadAnyLane.
///
/// Does nothing when operand 0 is not assigned to the SGPR bank.
void AMDGPU::buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
                                 const RegisterBankInfo &RBI) {
  MachineRegisterInfo &MRI = *B.getMRI();
  MachineOperand &DefOp = MI.getOperand(0);
  const Register SgprDst = DefOp.getReg();

  if (MRI.getRegBankOrNull(SgprDst) != &RBI.getRegBank(AMDGPU::SGPRRegBankID))
    return;

  const Register NewVgprDst =
      MRI.createGenericVirtualRegister(MRI.getType(SgprDst));
  MRI.setRegBank(NewVgprDst, RBI.getRegBank(AMDGPU::VGPRRegBankID));
  DefOp.setReg(NewVgprDst);

  // Insert the read-any-lane sequence immediately after MI.
  B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
  buildReadAnyLane(B, SgprDst, NewVgprDst, RBI);
}
227+
228+
/// Return true if \p Reg is assigned to the VCC register bank, or if it has an
/// SGPR register class and its type is S1.
bool AMDGPU::isLaneMask(Register Reg, MachineRegisterInfo &MRI,
                        const SIRegisterInfo *TRI) {
  if (const RegisterBank *Bank = MRI.getRegBankOrNull(Reg)) {
    if (Bank->getID() == VCCRegBankID)
      return true;
  }

  const TargetRegisterClass *RegClass = MRI.getRegClassOrNull(Reg);
  return RegClass && TRI->isSGPRClass(RegClass) &&
         MRI.getType(Reg) == LLT::scalar(1);
}
240+
241+
/// Return true if \p Reg has a register bank assigned and it is the SGPR bank.
bool AMDGPU::isSgprRB(Register Reg, MachineRegisterInfo &MRI) {
  const RegisterBank *Bank = MRI.getRegBankOrNull(Reg);
  return Bank && Bank->getID() == SGPRRegBankID;
}
248+
249+
/// Return true if \p Reg has a register bank assigned and it is the VGPR bank.
bool AMDGPU::isVgprRB(Register Reg, MachineRegisterInfo &MRI) {
  const RegisterBank *Bank = MRI.getRegBankOrNull(Reg);
  return Bank && Bank->getID() == VGPRRegBankID;
}
256+
257+
/// Erase \p MI and then, if \p Optional0 is non-null and has become trivially
/// dead, erase it as well.
void AMDGPU::cleanUpAfterCombine(MachineInstr &MI, MachineRegisterInfo &MRI,
                                 MachineInstr *Optional0) {
  MI.eraseFromParent();

  if (!Optional0)
    return;

  // Deadness is checked only after MI is gone.
  if (isTriviallyDead(*Optional0, MRI))
    Optional0->eraseFromParent();
}
263+
264+
/// Return true if any instruction in \p MF has a virtual register operand that
/// is assigned to the SGPR bank and has type S1.
///
/// In builds where dump() is available, the block and instruction containing
/// the first such operand are dumped before returning.
bool AMDGPU::hasSGPRS1(MachineFunction &MF, MachineRegisterInfo &MRI) {
  const LLT S1 = LLT::scalar(1);

  for (MachineBasicBlock &MBB : MF) {
    // Nothing is erased during the scan, so a plain range-for is sufficient.
    for (MachineInstr &MI : MBB) {
      for (MachineOperand &Op : MI.operands()) {
        if (!Op.isReg())
          continue;

        Register Reg = Op.getReg();
        if (!Reg.isVirtual())
          continue;

        if (!isSgprRB(Reg, MRI) || MRI.getType(Reg) != S1)
          continue;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
        // dump() methods are only defined in asserts/LLVM_ENABLE_DUMP builds;
        // calling them unconditionally breaks release builds at link time.
        MI.getParent()->dump();
        MI.dump();
#endif
        return true;
      }
    }
  }
  return false;
}
286+
287+
/// Return true if the low-level type of \p Reg is a 1-bit scalar.
bool AMDGPU::isS1(Register Reg, MachineRegisterInfo &MRI) {
  const LLT S1 = LLT::scalar(1);
  return MRI.getType(Reg) == S1;
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1010
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1111

12+
#include "AMDGPURegisterBankInfo.h"
13+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1214
#include "llvm/ADT/DenseSet.h"
15+
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
16+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1317
#include "llvm/CodeGen/MachineFunction.h"
1418
#include "llvm/CodeGen/Register.h"
1519
#include <utility>
@@ -48,7 +52,58 @@ class IntrinsicLaneMaskAnalyzer {
4852
// This will not be needed when we turn off LCSSA for global-isel.
4953
void findLCSSAPhi(Register Reg);
5054
};
55+
56+
void buildReadAnyLaneS1(MachineIRBuilder &B, MachineInstr &MI,
57+
const RegisterBankInfo &RBI);
58+
59+
MachineInstrBuilder buildReadAnyLaneB32(MachineIRBuilder &B,
60+
const DstOp &SgprDst,
61+
const SrcOp &VgprSrc,
62+
const RegisterBankInfo &RBI);
63+
64+
MachineInstrBuilder buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B,
65+
const DstOp &SgprDst,
66+
const SrcOp &VgprSrc,
67+
LLT B32Ty,
68+
const RegisterBankInfo &RBI);
69+
70+
MachineInstrBuilder buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B,
71+
const DstOp &SgprDst,
72+
const SrcOp &VgprSrc,
73+
const RegisterBankInfo &RBI);
74+
75+
MachineInstrBuilder buildReadAnyLane(MachineIRBuilder &B, const DstOp &SgprDst,
76+
const SrcOp &VgprSrc,
77+
const RegisterBankInfo &RBI);
78+
79+
// Create new vgpr destination register for MI then move it to current
80+
// MI's sgpr destination using one or more G_READANYLANE instructions.
81+
void buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
82+
const RegisterBankInfo &RBI);
83+
84+
// Share with SIRegisterInfo::isUniformReg? This could make uniformity info give
85+
// same result in later passes.
86+
bool isLaneMask(Register Reg, MachineRegisterInfo &MRI,
87+
const SIRegisterInfo *TRI);
88+
89+
bool isSgprRB(Register Reg, MachineRegisterInfo &MRI);
90+
91+
bool isVgprRB(Register Reg, MachineRegisterInfo &MRI);
92+
93+
template <typename SrcTy>
94+
inline MIPatternMatch::UnaryOp_match<SrcTy, AMDGPU::G_READANYLANE>
95+
m_GReadAnyLane(const SrcTy &Src) {
96+
return MIPatternMatch::UnaryOp_match<SrcTy, AMDGPU::G_READANYLANE>(Src);
5197
}
52-
}
98+
99+
void cleanUpAfterCombine(MachineInstr &MI, MachineRegisterInfo &MRI,
100+
MachineInstr *Optional0 = nullptr);
101+
102+
bool hasSGPRS1(MachineFunction &MF, MachineRegisterInfo &MRI);
103+
104+
bool isS1(Register Reg, MachineRegisterInfo &MRI);
105+
106+
} // namespace AMDGPU
107+
} // namespace llvm
53108

54109
#endif

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
217217
return true;
218218
}
219219

220+
/// Select G_COPY_SCC_VCC: set SCC to "source lane mask != 0" and copy SCC into
/// the destination, which is constrained to SGPR_32.
///
/// SCC is normally produced by S_CMP_LG_U32/U64 against 0. On wave64
/// subtargets without a 64-bit scalar compare, S_OR_B64 of the mask with itself
/// is used instead; its result register is unused and only the SCC side effect
/// matters.
///
/// \returns false if register constraining fails.
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  Register VCCReg = I.getOperand(1).getReg();
  MachineInstr *SetSCC = nullptr;

  if (STI.isWave64() && !STI.hasScalarCompareEq64()) {
    // S_CMP_LG_U64 is not available here; S_OR_B64 sets SCC = (result != 0).
    Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
    SetSCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
                 .addReg(VCCReg)
                 .addReg(VCCReg);
  } else {
    unsigned CmpOpc =
        STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
    SetSCC = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
  }

  if (!constrainSelectedInstRegOperands(*SetSCC, TII, TRI, RBI))
    return false;

  Register DstReg = I.getOperand(0).getReg();
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SGPR_32RegClass, *MRI);
}
238+
239+
/// Select G_COPY_VCC_SCC: turn the source boolean into a lane mask in the
/// destination.
///
/// A constant source is folded: 0 becomes S_MOV_B32/B64 of 0, and 1 becomes a
/// COPY of exec; the destination is then constrained to the bool register
/// class. Otherwise the source is copied into SCC and S_CSELECT_B32/B64 picks
/// exec or 0 based on SCC.
///
/// \returns false if register constraining fails.
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  const bool IsWave64 = STI.isWave64();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  std::optional<ValueAndVReg> ConstSrc =
      getIConstantVRegValWithLookThrough(SrcReg, *MRI);

  if (ConstSrc) {
    const int64_t Value = ConstSrc->Value.getZExtValue();
    if (Value != 0) {
      // Constant true: all active lanes are set, i.e. the mask equals exec.
      assert(Value == 1);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
    } else {
      // Constant false: empty lane mask.
      unsigned MovOpc = IsWave64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg).addImm(0);
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RBLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  unsigned CSelectOpc =
      IsWave64 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *CSelect = BuildMI(*BB, &I, DL, TII.get(CSelectOpc), DstReg)
                              .addReg(TRI.getExec())
                              .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*CSelect, TII, TRI, RBI);
}
273+
274+
/// Select G_READANYLANE by replacing it with V_READFIRSTLANE_B32 using the same
/// destination and source registers.
///
/// \returns the result of constraining the new instruction's register operands.
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const Register Dst = I.getOperand(0).getReg();
  const Register Src = I.getOperand(1).getReg();

  MachineInstr *ReadFirstLane =
      BuildMI(MBB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), Dst)
          .addReg(Src);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*ReadFirstLane, TII, TRI, RBI);
}
287+
220288
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
221289
const Register DefReg = I.getOperand(0).getReg();
222290
const LLT DefTy = MRI->getType(DefReg);
@@ -249,7 +317,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
249317
}
250318
}
251319

252-
// TODO: Verify that all registers have the same bank
320+
// If inputs have register bank, assign corresponding reg class.
321+
// Note: registers don't need to have the same reg bank.
322+
for (unsigned i = 1; i < I.getNumOperands(); i += 2) {
323+
const Register SrcReg = I.getOperand(i).getReg();
324+
325+
const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
326+
if (RB) {
327+
const LLT SrcTy = MRI->getType(SrcReg);
328+
const TargetRegisterClass *SrcRC =
329+
TRI.getRegClassForTypeOnBank(SrcTy, *RB);
330+
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
331+
return false;
332+
}
333+
}
334+
253335
I.setDesc(TII.get(TargetOpcode::PHI));
254336
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
255337
}
@@ -3656,6 +3738,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
36563738
return selectStackRestore(I);
36573739
case AMDGPU::G_PHI:
36583740
return selectPHI(I);
3741+
case AMDGPU::G_COPY_SCC_VCC:
3742+
return selectCOPY_SCC_VCC(I);
3743+
case AMDGPU::G_COPY_VCC_SCC:
3744+
return selectCOPY_VCC_SCC(I);
3745+
case AMDGPU::G_READANYLANE:
3746+
return selectReadAnyLane(I);
36593747
case TargetOpcode::G_CONSTANT:
36603748
case TargetOpcode::G_FCONSTANT:
36613749
default:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
8787

8888
bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
8989
bool selectCOPY(MachineInstr &I) const;
90+
bool selectCOPY_SCC_VCC(MachineInstr &I) const;
91+
bool selectCOPY_VCC_SCC(MachineInstr &I) const;
92+
bool selectReadAnyLane(MachineInstr &I) const;
9093
bool selectPHI(MachineInstr &I) const;
9194
bool selectG_TRUNC(MachineInstr &I) const;
9295
bool selectG_SZA_EXT(MachineInstr &I) const;

0 commit comments

Comments
 (0)