Skip to content

Commit a93b3b6

Browse files
AMDGPU/GlobalISel: AMDGPURegBankLegalize
Lower G_ instructions that can't be inst-selected with register bank assignment from AMDGPURegBankSelect based on uniformity analysis. - Lower instruction to perform it on assigned register bank - Put uniform value in vgpr because SALU instruction is not available - Execute divergent instruction in SALU - "waterfall loop". Given LLTs on all operands after legalizer, some register bank assignments require lowering while others do not. Note: cases where all register bank assignments would require lowering are lowered in legalizer. AMDGPURegBankLegalize goals: - Define Rules: when and how to perform lowering - Goal of defining Rules is to provide high level table-like brief overview of how to lower generic instructions based on available target features and uniformity info (uniform vs divergent). - Fast search of Rules, depends on how complicated Rule.Predicate is - For some opcodes there would be too many Rules that are essentially all the same just for different combinations of types and banks. Write custom function that handles all cases. - Rules are made from enum IDs that correspond to each operand. Names of IDs are meant to give brief description what lowering does for each operand or the whole instruction. - AMDGPURegBankLegalizeHelper implements lowering algorithms. Since this is the first patch that actually enables -new-reg-bank-select here is the summary of regression tests that were added earlier: - if instruction is uniform always select SALU instruction if available - eliminate back to back vgpr to sgpr to vgpr copies of uniform values - fast rules: small differences for standard and vector instruction - enabling Rule based on target feature - salu_float - how to specify lowering algorithm - vgpr S64 AND to S32 - on G_TRUNC in reg, it is up to user to deal with truncated bits; G_TRUNC in reg is treated as no-op. 
- dealing with truncated high bits - ABS S16 to S32 - sgpr S1 phi lowering - new opcodes for vcc-to-scc and scc-to-vcc copies - lowering for vgprS1-to-vcc copy (formally this is vgpr-to-vcc G_TRUNC) - S1 zext and sext lowering to select - uniform and divergent S1 AND (OR and XOR) lowering - inst-selected into SALU instruction - divergent phi with uniform inputs - divergent instruction with temporal divergent use, source instruction is defined as uniform (AMDGPURegBankSelect) - missing temporal divergence lowering - uniform phi, because of undef incoming, is assigned to vgpr. Will be fixed in AMDGPURegBankSelect via another fix in machine uniformity analysis.
1 parent c19466b commit a93b3b6

19 files changed

+1958
-255
lines changed

llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,6 +1073,11 @@ class MachineIRBuilder {
10731073
/// Build and insert an unmerge of \p Res sized pieces to cover \p Op
10741074
MachineInstrBuilder buildUnmerge(LLT Res, const SrcOp &Op);
10751075

1076+
/// Build and insert an unmerge of pieces with \p Attrs register attributes to
1077+
/// cover \p Op
1078+
MachineInstrBuilder buildUnmerge(MachineRegisterInfo::VRegAttrs Attrs,
1079+
const SrcOp &Op);
1080+
10761081
/// Build and insert \p Res = G_BUILD_VECTOR \p Op0, ...
10771082
///
10781083
/// G_BUILD_VECTOR creates a vector value from multiple scalar registers.

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,12 @@ class MachineRegisterInfo {
674674
return dyn_cast_if_present<const TargetRegisterClass *>(Val);
675675
}
676676

677+
/// Return the register bank of \p Reg.
678+
/// This shouldn't be used directly unless \p Reg has a register bank.
679+
const RegisterBank *getRegBank(Register Reg) const {
680+
return cast<const RegisterBank *>(VRegInfo[Reg.id()].first);
681+
}
682+
677683
/// Return the register bank of \p Reg, or null if Reg has not been assigned
678684
/// a register bank or has been assigned a register class.
679685
/// \note It is possible to get the register bank from the register class via

llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,15 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(LLT Res,
698698
return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
699699
}
700700

701+
/// Build a G_UNMERGE_VALUES of \p Op whose results all carry the register
/// attributes \p Attrs. The number of results is the bit size of \p Op divided
/// by the bit size of \p Attrs.Ty.
MachineInstrBuilder
MachineIRBuilder::buildUnmerge(MachineRegisterInfo::VRegAttrs Attrs,
                               const SrcOp &Op) {
  const LLT SrcTy = Op.getLLTTy(*getMRI());
  // One destination per Attrs.Ty-sized slice of the source.
  unsigned NumPieces = SrcTy.getSizeInBits() / Attrs.Ty.getSizeInBits();
  SmallVector<DstOp, 8> Dsts(NumPieces, Attrs);
  return buildInstr(TargetOpcode::G_UNMERGE_VALUES, Dsts, Op);
}
709+
701710
MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<Register> Res,
702711
const SrcOp &Op) {
703712
// Unfortunately to convert from ArrayRef<Register> to ArrayRef<DstOp>,

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,59 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
106106
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
107107
}
108108
}
109+
110+
/// Pick the piece type used to split \p Ty before reading it lane-wise.
/// - non-vector types (large scalars, 64-bit pointers): 32-bit scalar
/// - vectors with 16-bit elements: two-element vector of that element type
/// - any other vector (S32, S64 or pointer elements): the element type
static LLT getReadAnyLaneSplitTy(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(32);

  const LLT EltTy = Ty.getElementType();
  return EltTy.getSizeInBits() == 16 ? LLT::fixed_vector(2, EltTy) : EltTy;
}
122+
123+
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
124+
const RegisterBankInfo &RBI);
125+
126+
/// Unmerge \p VgprSrc into VGPR-bank pieces of type \p UnmergeTy, run
/// buildReadAnyLane on every piece, and append the resulting registers to
/// \p SgprDstParts in piece order.
static void unmergeReadAnyLane(MachineIRBuilder &B,
                               SmallVectorImpl<Register> &SgprDstParts,
                               LLT UnmergeTy, Register VgprSrc,
                               const RegisterBankInfo &RBI) {
  const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
  auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);

  // Every operand except the last one (the unmerged source) is a piece.
  const unsigned NumPieces = Unmerge->getNumOperands() - 1;
  for (unsigned Piece = 0; Piece < NumPieces; ++Piece) {
    Register SgprPiece = buildReadAnyLane(B, Unmerge.getReg(Piece), RBI);
    SgprDstParts.push_back(SgprPiece);
  }
}
136+
137+
/// Build a read of \p VgprSrc into a new SGPR-bank register of the same type
/// and return that register. A 32-bit value takes one G_AMDGPU_READANYLANE;
/// any other size is split, read piece by piece and merged back together.
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
                                 const RegisterBankInfo &RBI) {
  const LLT SrcTy = B.getMRI()->getType(VgprSrc);
  const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);

  if (SrcTy.getSizeInBits() != 32) {
    SmallVector<Register, 8> Pieces;
    unmergeReadAnyLane(B, Pieces, getReadAnyLaneSplitTy(SrcTy), VgprSrc, RBI);
    return B.buildMergeLikeInstr({SgprRB, SrcTy}, Pieces).getReg(0);
  }

  auto ReadLane = B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE,
                               {{SgprRB, SrcTy}}, {VgprSrc});
  return ReadLane.getReg(0);
}
151+
152+
/// Emit code that reads the value of \p VgprSrc into \p SgprDst.
///
/// A 32-bit source becomes a single G_AMDGPU_READANYLANE defining \p SgprDst.
/// Any other size is unmerged into pieces (piece type chosen by
/// getReadAnyLaneSplitTy), each piece is read separately, and the results are
/// merged back into \p SgprDst.
///
/// \param B       Builder used to insert the new instructions.
/// \param SgprDst Register defined by the emitted read or merge.
/// \param VgprSrc Register whose value is read; its LLT selects the path.
/// \param RBI     Used to look up the register banks for the new registers.
void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
                              Register VgprSrc, const RegisterBankInfo &RBI) {
  LLT Ty = B.getMRI()->getType(VgprSrc);
  if (Ty.getSizeInBits() == 32) {
    B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
    return;
  }

  SmallVector<Register, 8> SgprDstParts;
  unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);

  // The merge defines SgprDst directly, so its result register is not needed.
  B.buildMergeLikeInstr(SgprDst, SgprDstParts);
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1010
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1111

12+
#include "AMDGPURegisterBankInfo.h"
13+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1214
#include "llvm/ADT/DenseSet.h"
15+
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
16+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1317
#include "llvm/CodeGen/MachineFunction.h"
1418
#include "llvm/CodeGen/Register.h"
1519
#include <utility>
@@ -48,7 +52,11 @@ class IntrinsicLaneMaskAnalyzer {
4852
// This will not be needed when we turn off LCSSA for global-isel.
4953
void findLCSSAPhi(Register Reg);
5054
};
51-
}
52-
}
55+
56+
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
57+
const RegisterBankInfo &RBI);
58+
59+
} // namespace AMDGPU
60+
} // namespace llvm
5361

5462
#endif

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
217217
return true;
218218
}
219219

220+
/// Select G_AMDGPU_COPY_SCC_VCC: compare the source operand against zero with
/// S_CMP_LG_U64 (wave64) or S_CMP_LG_U32 (otherwise), then copy SCC into the
/// destination, which is constrained to SReg_32.
/// Returns false if constraining the compare or the destination fails.
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  const unsigned CmpOpc =
      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
  const Register SrcReg = I.getOperand(1).getReg();
  MachineInstr *Cmp =
      BuildMI(MBB, &I, DL, TII.get(CmpOpc)).addReg(SrcReg).addImm(0);
  if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
    return false;

  // The compare result lives in SCC; move it into the destination.
  const Register DstReg = I.getOperand(0).getReg();
  BuildMI(MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}
238+
239+
/// Select G_AMDGPU_COPY_VCC_SCC.
///
/// Constant source: 0 becomes S_MOV_B64/S_MOV_B32 of 0, and 1 becomes a COPY
/// of the exec register; the destination is constrained to the bool register
/// class.
/// Otherwise: the source is copied into SCC and S_CSELECT_B64/S_CSELECT_B32
/// chooses between exec and 0.
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const bool IsWave64 = STI.isWave64();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(SrcReg, *MRI);
  if (Arg) {
    const int64_t Value = Arg->Value.getZExtValue();
    if (Value != 0) {
      assert(Value == 1);
      BuildMI(MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
          .addReg(TRI.getExec());
    } else {
      const unsigned MovOpc = IsWave64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(MBB, &I, DL, TII.get(MovOpc), DstReg).addImm(0);
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  const unsigned SelectOpc =
      IsWave64 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *Select = BuildMI(MBB, &I, DL, TII.get(SelectOpc), DstReg)
                             .addReg(TRI.getExec())
                             .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
}
273+
274+
/// Select G_AMDGPU_READANYLANE by replacing it with V_READFIRSTLANE_B32 on
/// the same destination and source registers.
/// Returns the result of constraining the new instruction's operands.
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  auto ReadFirstLane =
      BuildMI(MBB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
          .addReg(SrcReg);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*ReadFirstLane, TII, TRI, RBI);
}
287+
220288
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
221289
const Register DefReg = I.getOperand(0).getReg();
222290
const LLT DefTy = MRI->getType(DefReg);
@@ -249,7 +317,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
249317
}
250318
}
251319

252-
// TODO: Verify that all registers have the same bank
320+
// If inputs have register bank, assign corresponding reg class.
321+
// Note: registers don't need to have the same reg bank.
322+
for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
323+
const Register SrcReg = I.getOperand(i).getReg();
324+
325+
const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
326+
if (RB) {
327+
const LLT SrcTy = MRI->getType(SrcReg);
328+
const TargetRegisterClass *SrcRC =
329+
TRI.getRegClassForTypeOnBank(SrcTy, *RB);
330+
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
331+
return false;
332+
}
333+
}
334+
253335
I.setDesc(TII.get(TargetOpcode::PHI));
254336
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
255337
}
@@ -4015,6 +4097,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
40154097
return selectStackRestore(I);
40164098
case AMDGPU::G_PHI:
40174099
return selectPHI(I);
4100+
case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4101+
return selectCOPY_SCC_VCC(I);
4102+
case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4103+
return selectCOPY_VCC_SCC(I);
4104+
case AMDGPU::G_AMDGPU_READANYLANE:
4105+
return selectReadAnyLane(I);
40184106
case TargetOpcode::G_CONSTANT:
40194107
case TargetOpcode::G_FCONSTANT:
40204108
default:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
8787

8888
bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
8989
bool selectCOPY(MachineInstr &I) const;
90+
bool selectCOPY_SCC_VCC(MachineInstr &I) const;
91+
bool selectCOPY_VCC_SCC(MachineInstr &I) const;
92+
bool selectReadAnyLane(MachineInstr &I) const;
9093
bool selectPHI(MachineInstr &I) const;
9194
bool selectG_TRUNC(MachineInstr &I) const;
9295
bool selectG_SZA_EXT(MachineInstr &I) const;

0 commit comments

Comments
 (0)