Skip to content

Commit 2124eb3

Browse files
AMDGPU/GlobalISel: RBSelect
Assign register banks to virtual registers. Defs and uses of G_ instructions have register banks exclusively; if they had a register class, reassign an appropriate register bank. Assign register banks using machine uniformity analysis: SGPR - uniform values and some lane masks, VGPR - divergent, non-S1, values, VCC - divergent S1 values (lane masks). RBSelect does not consider available instructions and, in some cases, G_ instructions with some register bank assignment can't be inst-selected. This is solved in RBLegalize. Exceptions when uniformity analysis does not work: S32/S64 lane masks: - need to end up with SGPR register class after instruction selection - In most cases Uniformity analysis declares them as uniform (forced by tablegen) resulting in sgpr S32/S64 reg bank - When Uniformity analysis declares them as divergent (some phis), use intrinsic lane mask analyzer to still assign sgpr register bank. Temporal divergence copy: - COPY to vgpr with implicit use of $exec inside of the cycle - this copy is declared as uniform by uniformity analysis - make sure that the assigned bank is vgpr. Note: uniformity analysis does not consider that registers with vgpr def are divergent (you can have a uniform value in vgpr). - TODO: implicit use of $exec could be implemented as an indicator that the instruction is divergent
1 parent ff34aa1 commit 2124eb3

File tree

5 files changed

+971
-686
lines changed

5 files changed

+971
-686
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,16 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "AMDGPUGlobalISelUtils.h"
10+
#include "AMDGPURegisterBankInfo.h"
1011
#include "GCNSubtarget.h"
1112
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
1213
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
1314
#include "llvm/CodeGenTypes/LowLevelType.h"
1415
#include "llvm/IR/Constants.h"
16+
#include "llvm/IR/IntrinsicsAMDGPU.h"
1517

1618
using namespace llvm;
19+
using namespace AMDGPU;
1720
using namespace MIPatternMatch;
1821

1922
std::pair<Register, unsigned>
@@ -69,3 +72,38 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
6972

7073
return std::pair(Reg, 0);
7174
}
75+
76+
IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF)
77+
: MRI(MF.getRegInfo()) {
78+
initLaneMaskIntrinsics(MF);
79+
}
80+
81+
bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) {
82+
return S32S64LaneMask.contains(Reg);
83+
}
84+
85+
void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
86+
for (auto &MBB : MF) {
87+
for (auto &MI : MBB) {
88+
if (MI.getOpcode() == AMDGPU::G_INTRINSIC &&
89+
MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID() ==
90+
Intrinsic::amdgcn_if_break) {
91+
S32S64LaneMask.insert(MI.getOperand(3).getReg());
92+
findLCSSAPhi(MI.getOperand(0).getReg());
93+
}
94+
95+
if (MI.getOpcode() == AMDGPU::SI_IF ||
96+
MI.getOpcode() == AMDGPU::SI_ELSE) {
97+
findLCSSAPhi(MI.getOperand(0).getReg());
98+
}
99+
}
100+
}
101+
}
102+
103+
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
104+
S32S64LaneMask.insert(Reg);
105+
for (auto &LCSSAPhi : MRI.use_instructions(Reg)) {
106+
if (LCSSAPhi.isPHI())
107+
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
108+
}
109+
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1010
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1111

12+
#include "llvm/ADT/DenseSet.h"
13+
#include "llvm/CodeGen/MachineFunction.h"
1214
#include "llvm/CodeGen/Register.h"
1315
#include <utility>
1416

@@ -26,6 +28,26 @@ std::pair<Register, unsigned>
2628
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
2729
GISelKnownBits *KnownBits = nullptr,
2830
bool CheckNUW = false);
31+
32+
// Currently finds S32/S64 lane masks that can be declared as divergent by
// uniformity analysis (all are phis at the moment).
// These are defined as i32/i64 in some IR intrinsics (not as i1).
// Tablegen forces (via telling that lane mask IR intrinsics are uniform) most
// of the S32/S64 lane masks to be uniform, as this results in them ending up
// with an sgpr reg class after instruction-select; don't search for all of
// them.
class IntrinsicLaneMaskAnalyzer {
  // Registers known to hold S32/S64 lane masks.
  DenseSet<Register> S32S64LaneMask;
  MachineRegisterInfo &MRI;

public:
  IntrinsicLaneMaskAnalyzer(MachineFunction &MF);
  // Returns true if Reg was recorded as an S32/S64 lane mask.
  bool isS32S64LaneMask(Register Reg);

private:
  // Scans MF once and records lane-mask registers.
  void initLaneMaskIntrinsics(MachineFunction &MF);
  // This will not be needed when we turn off LCSSA for global-isel.
  void findLCSSAPhi(Register Reg);
};
2951
}
3052
}
3153

llvm/lib/Target/AMDGPU/AMDGPURBSelect.cpp

Lines changed: 193 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,12 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "AMDGPUGlobalISelUtils.h"
20+
#include "AMDGPURegisterBankInfo.h"
21+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1923
#include "llvm/CodeGen/MachineFunctionPass.h"
24+
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2025
#include "llvm/InitializePasses.h"
2126

2227
#define DEBUG_TYPE "rb-select"
@@ -39,6 +44,7 @@ class AMDGPURBSelect : public MachineFunctionPass {
3944
StringRef getPassName() const override { return "AMDGPU RB select"; }
4045

4146
void getAnalysisUsage(AnalysisUsage &AU) const override {
47+
AU.addRequired<MachineUniformityAnalysisPass>();
4248
MachineFunctionPass::getAnalysisUsage(AU);
4349
}
4450

@@ -54,6 +60,7 @@ class AMDGPURBSelect : public MachineFunctionPass {
5460

5561
INITIALIZE_PASS_BEGIN(AMDGPURBSelect, DEBUG_TYPE, "AMDGPU RB select", false,
5662
false)
63+
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
5764
INITIALIZE_PASS_END(AMDGPURBSelect, DEBUG_TYPE, "AMDGPU RB select", false,
5865
false)
5966

@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID;
6370

6471
FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); }
6572

66-
bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; }
73+
// Returns true if MI is a generic instruction whose register operands should
// receive register banks from this pass.
bool shouldRBSelect(MachineInstr &MI) {
  // Already inst-selected target instructions keep their register classes.
  if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode())
    return false;

  // PHI, IMPLICIT_DEF and inline asm are skipped as well.
  return MI.getOpcode() != AMDGPU::PHI &&
         MI.getOpcode() != AMDGPU::IMPLICIT_DEF && !MI.isInlineAsm();
}
85+
86+
void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B,
87+
MachineRegisterInfo &MRI, const RegisterBank &RB) {
88+
Register Reg = DefOP.getReg();
89+
// Register that already has Register class got it during pre-inst selection
90+
// of another instruction. Maybe cross bank copy was required so we insert a
91+
// copy trat can be removed later. This simplifies post-rb-legalize artifact
92+
// combiner and avoids need to special case some patterns.
93+
if (MRI.getRegClassOrNull(Reg)) {
94+
LLT Ty = MRI.getType(Reg);
95+
Register NewReg = MRI.createVirtualRegister({&RB, Ty});
96+
DefOP.setReg(NewReg);
97+
98+
auto &MBB = *MI.getParent();
99+
B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI()
100+
: std::next(MI.getIterator()));
101+
B.buildCopy(Reg, NewReg);
102+
103+
// The problem was discoverd for uniform S1 that was used as both
104+
// lane mask(vcc) and regular sgpr S1.
105+
// - lane-mask(vcc) use was by si_if, this use is divergent and requires
106+
// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets
107+
// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask.
108+
// - the regular regular sgpr S1(uniform) instruction is now broken since
109+
// it uses sreg_64_xexec(S1) which is divergent.
110+
111+
// "Clear" reg classes from uses on generic instructions and but register
112+
// banks instead.
113+
for (auto &UseMI : MRI.use_instructions(Reg)) {
114+
if (shouldRBSelect(UseMI)) {
115+
for (MachineOperand &Op : UseMI.operands()) {
116+
if (Op.isReg() && Op.isUse() && Op.getReg() == Reg)
117+
Op.setReg(NewReg);
118+
}
119+
}
120+
}
121+
122+
} else {
123+
MRI.setRegBank(Reg, RB);
124+
}
125+
}
126+
127+
void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B,
128+
MachineRegisterInfo &MRI, const RegisterBank &RB) {
129+
Register Reg = UseOP.getReg();
130+
131+
LLT Ty = MRI.getType(Reg);
132+
Register NewReg = MRI.createVirtualRegister({&RB, Ty});
133+
UseOP.setReg(NewReg);
134+
135+
if (MI.isPHI()) {
136+
auto DefMI = MRI.getVRegDef(Reg)->getIterator();
137+
MachineBasicBlock *DefMBB = DefMI->getParent();
138+
B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
139+
} else {
140+
B.setInstr(MI);
141+
}
142+
143+
B.buildCopy(NewReg, Reg);
144+
}
145+
146+
// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
147+
// the cycle
148+
// Note: uniformity analysis does not consider that registers with vgpr def are
149+
// divergent (you can have uniform value in vgpr).
150+
// - TODO: implicit use of $exec could be implemented as indicator that
151+
// instruction is divergent
152+
bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) {
153+
MachineInstr *MI = MRI.getVRegDef(Reg);
154+
if (MI->getOpcode() == AMDGPU::COPY) {
155+
for (auto Op : MI->implicit_operands()) {
156+
if (!Op.isReg())
157+
continue;
158+
Register Reg = Op.getReg();
159+
if (Reg == AMDGPU::EXEC) {
160+
return true;
161+
}
162+
}
163+
}
164+
165+
return false;
166+
}
167+
168+
// Returns Op's register when it is a virtual register, a null Register
// otherwise.
Register getVReg(MachineOperand &Op) {
  if (!Op.isReg())
    return Register();

  Register R = Op.getReg();
  return R.isVirtual() ? R : Register();
}
178+
179+
bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) {
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF);
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo();

  MachineIRBuilder B(MF);

  // Assign register banks to ALL def registers on G_ instructions.
  // Same for copies if they have no register bank or class on def.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!shouldRBSelect(MI))
        continue;

      for (MachineOperand &DefOP : MI.defs()) {
        Register DefReg = getVReg(DefOP);
        if (!DefReg)
          continue;

        // Copies can have a register class on def registers.
        if (MI.isCopy() && MRI.getRegClassOrNull(DefReg))
          continue;

        // Uniform values and S32/S64 lane masks go to sgpr; divergent S1 is a
        // lane mask (vcc); all other divergent values go to vgpr.
        unsigned BankID;
        if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg))
          BankID = AMDGPU::SGPRRegBankID;
        else
          BankID = MRI.getType(DefReg) == LLT::scalar(1)
                       ? AMDGPU::VCCRegBankID
                       : AMDGPU::VGPRRegBankID;
        setRB(MI, DefOP, B, MRI, RBI.getRegBank(BankID));
      }
    }
  }

  // At this point all virtual registers have register class or bank
  // - Defs of G_ instructions have register banks.
  // - Defs and uses of inst-selected instructions have register class.
  // - Defs and uses of copies can have either register class or bank
  // and most notably
  // - Uses of G_ instructions can have either register class or bank

  // Reassign uses of G_ instructions to only have register banks.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!shouldRBSelect(MI))
        continue;

      // Copies can have a register class on use registers.
      if (MI.isCopy())
        continue;

      for (MachineOperand &UseOP : MI.uses()) {
        Register UseReg = getVReg(UseOP);
        if (!UseReg)
          continue;

        // Only uses that still carry a register class need rewriting.
        if (!MRI.getRegClassOrNull(UseReg))
          continue;

        // Same bank choice as for defs, except temporal divergence copies are
        // forced off the sgpr path even when declared uniform.
        unsigned BankID;
        if (!isTemporalDivergenceCopy(UseReg, MRI) &&
            (MUI.isUniform(UseReg) || ILMA.isS32S64LaneMask(UseReg)))
          BankID = AMDGPU::SGPRRegBankID;
        else
          BankID = MRI.getType(UseReg) == LLT::scalar(1)
                       ? AMDGPU::VCCRegBankID
                       : AMDGPU::VGPRRegBankID;
        setRBUse(MI, UseOP, B, MRI, RBI.getRegBank(BankID));
      }
    }
  }

  // Defs and uses of G_ instructions have register banks exclusively.

  return true;
}

0 commit comments

Comments
 (0)