Skip to content

Commit 813ee0e

Browse files
AMDGPU/GlobalISel: AMDGPURegBankLegalize
Lower G_ instructions that can't be inst-selected with register bank assignment from AMDGPURegBankSelect based on uniformity analysis. - Lower instruction to perform it on assigned register bank - Put uniform value in vgpr because SALU instruction is not available - Execute divergent instruction in SALU - "waterfall loop" Given LLTs on all operands after legalizer, some register bank assignments require lowering while others do not. Note: cases where all register bank assignments would require lowering are lowered in legalizer. AMDGPURegBankLegalize goals: - Define Rules: when and how to perform lowering - The goal of defining Rules is to provide a high-level, table-like brief overview of how to lower generic instructions based on available target features and uniformity info (uniform vs divergent). - Fast search of Rules, depends on how complicated Rule.Predicate is - For some opcodes there would be too many Rules that are essentially all the same just for different combinations of types and banks. Write a custom function that handles all cases. - Rules are made from enum IDs that correspond to each operand. Names of IDs are meant to give a brief description of what the lowering does for each operand or the whole instruction. - AMDGPURegBankLegalizeHelper implements lowering algorithms Since this is the first patch that actually enables -new-reg-bank-select, here is the summary of regression tests that were added earlier: - if instruction is uniform always select SALU instruction if available - eliminate back to back vgpr to sgpr to vgpr copies of uniform values - fast rules: small differences for standard and vector instruction - enabling Rule based on target feature - salu_float - how to specify lowering algorithm - vgpr S64 AND to S32 - on G_TRUNC in reg, it is up to user to deal with truncated bits G_TRUNC in reg is treated as no-op.
- dealing with truncated high bits - ABS S16 to S32 - sgpr S1 phi lowering - new opcodes for vcc-to-scc and scc-to-vcc copies - lowering for vgprS1-to-vcc copy (formally this is vgpr-to-vcc G_TRUNC) - S1 zext and sext lowering to select - uniform and divergent S1 AND (OR and XOR) lowering - inst-selected into SALU instruction - divergent phi with uniform inputs - divergent instruction with temporal divergent use, source instruction is defined as uniform (AMDGPURegBankSelect) - missing temporal divergence lowering - uniform phi, because of an undef incoming value, is assigned to vgpr. This will be fixed in AMDGPURegBankSelect via another fix in machine uniformity analysis.
1 parent 1c5977e commit 813ee0e

19 files changed

+1977
-253
lines changed

llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,6 +1073,11 @@ class MachineIRBuilder {
10731073
/// Build and insert an unmerge of \p Res sized pieces to cover \p Op
10741074
MachineInstrBuilder buildUnmerge(LLT Res, const SrcOp &Op);
10751075

1076+
/// Build and insert an unmerge of pieces with \p Attrs register attributes to
1077+
/// cover \p Op
1078+
MachineInstrBuilder buildUnmerge(MachineRegisterInfo::VRegAttrs Attrs,
1079+
const SrcOp &Op);
1080+
10761081
/// Build and insert \p Res = G_BUILD_VECTOR \p Op0, ...
10771082
///
10781083
/// G_BUILD_VECTOR creates a vector value from multiple scalar registers.

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,12 @@ class MachineRegisterInfo {
674674
return dyn_cast_if_present<const TargetRegisterClass *>(Val);
675675
}
676676

677+
/// Return the register bank of \p Reg.
678+
/// This shouldn't be used directly unless \p Reg has a register bank.
679+
const RegisterBank *getRegBank(Register Reg) const {
680+
return cast<const RegisterBank *>(VRegInfo[Reg.id()].first);
681+
}
682+
677683
/// Return the register bank of \p Reg, or null if Reg has not been assigned
678684
/// a register bank or has been assigned a register class.
679685
/// \note It is possible to get the register bank from the register class via

llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,15 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(LLT Res,
698698
return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
699699
}
700700

701+
MachineInstrBuilder
702+
MachineIRBuilder::buildUnmerge(MachineRegisterInfo::VRegAttrs Attrs,
703+
const SrcOp &Op) {
704+
LLT OpTy = Op.getLLTTy(*getMRI());
705+
unsigned NumRegs = OpTy.getSizeInBits() / Attrs.Ty.getSizeInBits();
706+
SmallVector<DstOp, 8> TmpVec(NumRegs, Attrs);
707+
return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
708+
}
709+
701710
MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<Register> Res,
702711
const SrcOp &Op) {
703712
// Unfortunately to convert from ArrayRef<Register> to ArrayRef<DstOp>,

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,13 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "AMDGPUGlobalISelUtils.h"
10+
#include "AMDGPURegisterBankInfo.h"
1011
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
12+
#include "llvm/ADT/DenseSet.h"
1113
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
1214
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
1315
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
16+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1417
#include "llvm/CodeGenTypes/LowLevelType.h"
1518
#include "llvm/IR/Constants.h"
1619
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -106,3 +109,59 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
106109
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
107110
}
108111
}
112+
113+
/// Pick the type of the pieces a value of type \p Ty is split into before each
/// piece is read with G_AMDGPU_READANYLANE.
///   - non-vector types (large scalars, 64-bit pointers): S32
///   - vectors of 16-bit elements: two-element vectors of that element type
///   - any other vector (S32, S64 or pointer elements): the element type
static LLT getReadAnyLaneSplitTy(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(32);

  const LLT EltTy = Ty.getElementType();
  return EltTy.getSizeInBits() == 16 ? LLT::fixed_vector(2, EltTy) : EltTy;
}
125+
126+
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
127+
const RegisterBankInfo &RBI);
128+
129+
/// Unmerge \p VgprSrc into pieces of type \p UnmergeTy on the VGPR bank and,
/// for every piece in order, append the register returned by buildReadAnyLane
/// to \p SgprDstParts.
static void unmergeReadAnyLane(MachineIRBuilder &B,
                               SmallVectorImpl<Register> &SgprDstParts,
                               LLT UnmergeTy, Register VgprSrc,
                               const RegisterBankInfo &RBI) {
  const RegisterBank &VgprBank = RBI.getRegBank(AMDGPU::VGPRRegBankID);
  auto Pieces = B.buildUnmerge({&VgprBank, UnmergeTy}, VgprSrc);

  // Every operand except the last one (the unmerge source) is a piece.
  const unsigned NumPieces = Pieces->getNumOperands() - 1;
  for (unsigned Idx = 0; Idx != NumPieces; ++Idx)
    SgprDstParts.push_back(buildReadAnyLane(B, Pieces.getReg(Idx), RBI));
}
139+
140+
/// Read \p VgprSrc into a freshly created register on the SGPR bank with the
/// same type and return that register. A 32-bit value is read with a single
/// G_AMDGPU_READANYLANE; any other size is split with unmergeReadAnyLane and
/// the resulting SGPR pieces are merged back together.
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
                                 const RegisterBankInfo &RBI) {
  const LLT SrcTy = B.getMRI()->getType(VgprSrc);
  const RegisterBank *SgprBank = &RBI.getRegBank(AMDGPU::SGPRRegBankID);

  if (SrcTy.getSizeInBits() != 32) {
    // Read the value piece by piece, then reassemble it on the SGPR bank.
    SmallVector<Register, 8> SgprPieces;
    unmergeReadAnyLane(B, SgprPieces, getReadAnyLaneSplitTy(SrcTy), VgprSrc,
                       RBI);
    return B.buildMergeLikeInstr({SgprBank, SrcTy}, SgprPieces).getReg(0);
  }

  auto ReadLane = B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE,
                               {{SgprBank, SrcTy}}, {VgprSrc});
  return ReadLane.getReg(0);
}
154+
155+
/// Read \p VgprSrc into the existing register \p SgprDst.
///
/// \param B        Builder used to insert the new instructions.
/// \param SgprDst  Destination register; it is used directly as the result of
///                 the emitted G_AMDGPU_READANYLANE or merge instruction.
/// \param VgprSrc  Source register; its type decides how the read is split.
/// \param RBI      Register bank info used to create the intermediate pieces.
///
/// A 32-bit source is read with a single G_AMDGPU_READANYLANE. Any other size
/// is split into getReadAnyLaneSplitTy() pieces, each piece is read
/// separately, and the pieces are merged into \p SgprDst.
void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
                              Register VgprSrc, const RegisterBankInfo &RBI) {
  LLT Ty = B.getMRI()->getType(VgprSrc);
  if (Ty.getSizeInBits() == 32) {
    B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
    return;
  }

  SmallVector<Register, 8> SgprDstParts;
  unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);

  // The merge defines SgprDst directly; there is no result to hand back.
  B.buildMergeLikeInstr(SgprDst, SgprDstParts);
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ class GCNSubtarget;
2020
class GISelKnownBits;
2121
class LLT;
2222
class MachineFunction;
23+
class MachineIRBuilder;
24+
class RegisterBankInfo;
2325

2426
namespace AMDGPU {
2527

@@ -48,6 +50,9 @@ class IntrinsicLaneMaskAnalyzer {
4850
// This will not be needed when we turn off LCSSA for global-isel.
4951
void findLCSSAPhi(Register Reg);
5052
};
53+
54+
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
55+
const RegisterBankInfo &RBI);
5156
}
5257
}
5358

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
217217
return true;
218218
}
219219

220+
/// Select G_AMDGPU_COPY_SCC_VCC: set SCC when the lane mask in operand 1 is
/// non-zero, then copy SCC into the destination register (operand 0).
///
/// \returns false if the SCC-defining instruction or the destination register
/// cannot be constrained. In the first case \p I is left in place.
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  Register VCCReg = I.getOperand(1).getReg();
  MachineInstr *Cmp;

  // Set SCC as a side effect of either a scalar compare or a scalar OR.
  if (STI.hasScalarCompareEq64()) {
    unsigned CmpOpc =
        STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
    Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
  } else {
    // Subtargets without 64-bit scalar compares cannot use S_CMP_LG_U64.
    // OR the mask with itself instead: S_OR_B64 sets SCC when its result is
    // non-zero. The OR result itself is not needed.
    Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
    Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
              .addReg(VCCReg)
              .addReg(VCCReg);
  }

  if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
    return false;

  Register DstReg = I.getOperand(0).getReg();
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}
238+
239+
/// Select G_AMDGPU_COPY_VCC_SCC: turn the boolean in operand 1 into a lane
/// mask in operand 0.
///   - constant 0 source: move 0 into the destination
///   - constant 1 source: copy exec into the destination
///   - otherwise: copy the source into SCC and select between exec and 0
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &Loc = I.getDebugLoc();
  const bool IsWave64 = STI.isWave64();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  if (std::optional<ValueAndVReg> ConstSrc =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI)) {
    const int64_t Imm = ConstSrc->Value.getZExtValue();
    if (Imm != 0) {
      assert(Imm == 1);
      BuildMI(*MBB, &I, Loc, TII.get(AMDGPU::COPY), DstReg)
          .addReg(TRI.getExec());
    } else {
      const unsigned MovOpc = IsWave64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*MBB, &I, Loc, TII.get(MovOpc), DstReg).addImm(0);
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(*MBB, &I, Loc, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  const unsigned CSelectOpc =
      IsWave64 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *CSelect = BuildMI(*MBB, &I, Loc, TII.get(CSelectOpc), DstReg)
                              .addReg(TRI.getExec())
                              .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*CSelect, TII, TRI, RBI);
}
273+
274+
/// Select G_AMDGPU_READANYLANE as V_READFIRSTLANE_B32 from operand 1 into
/// operand 0, erase \p I, and constrain the operands of the new instruction.
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &Loc = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  MachineInstr *ReadLane =
      BuildMI(*MBB, &I, Loc, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
          .addReg(SrcReg);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*ReadLane, TII, TRI, RBI);
}
287+
220288
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
221289
const Register DefReg = I.getOperand(0).getReg();
222290
const LLT DefTy = MRI->getType(DefReg);
@@ -249,7 +317,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
249317
}
250318
}
251319

252-
// TODO: Verify that all registers have the same bank
320+
// If inputs have register bank, assign corresponding reg class.
321+
// Note: registers don't need to have the same reg bank.
322+
for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
323+
const Register SrcReg = I.getOperand(i).getReg();
324+
325+
const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
326+
if (RB) {
327+
const LLT SrcTy = MRI->getType(SrcReg);
328+
const TargetRegisterClass *SrcRC =
329+
TRI.getRegClassForTypeOnBank(SrcTy, *RB);
330+
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
331+
return false;
332+
}
333+
}
334+
253335
I.setDesc(TII.get(TargetOpcode::PHI));
254336
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
255337
}
@@ -4014,6 +4096,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
40144096
return selectStackRestore(I);
40154097
case AMDGPU::G_PHI:
40164098
return selectPHI(I);
4099+
case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4100+
return selectCOPY_SCC_VCC(I);
4101+
case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4102+
return selectCOPY_VCC_SCC(I);
4103+
case AMDGPU::G_AMDGPU_READANYLANE:
4104+
return selectReadAnyLane(I);
40174105
case TargetOpcode::G_CONSTANT:
40184106
case TargetOpcode::G_FCONSTANT:
40194107
default:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
8787

8888
bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
8989
bool selectCOPY(MachineInstr &I) const;
90+
bool selectCOPY_SCC_VCC(MachineInstr &I) const;
91+
bool selectCOPY_VCC_SCC(MachineInstr &I) const;
92+
bool selectReadAnyLane(MachineInstr &I) const;
9093
bool selectPHI(MachineInstr &I) const;
9194
bool selectG_TRUNC(MachineInstr &I) const;
9295
bool selectG_SZA_EXT(MachineInstr &I) const;

0 commit comments

Comments
 (0)