Skip to content

Commit 06f711a

Browse files
AMDGPU/GlobalISelDivergenceLowering: select divergent i1 phis (#80003)
Implement PhiLoweringHelper for GlobalISel in DivergenceLoweringHelper. Use machine uniformity analysis to find divergent i1 phis and select them as lane mask phis in the same way SILowerI1Copies selects VReg_1 phis. Note that divergent i1 phis include phis created by LCSSA, and all cases of uses outside of a cycle are actually covered by "lowering LCSSA phis". GlobalISel lane masks are registers with an sgpr register class and S1 LLT. TODO: The general goal is that instructions created in this pass are fully instruction-selected so that selection of lane mask phis is not split across multiple passes. Patch 3 from: #73337
1 parent 89ec940 commit 06f711a

21 files changed

+827
-259
lines changed

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -752,6 +752,24 @@ class MachineRegisterInfo {
752752
Register createVirtualRegister(const TargetRegisterClass *RegClass,
753753
StringRef Name = "");
754754

755+
/// All attributes (register class or bank, and low-level type) a virtual
756+
/// register can have.
757+
struct VRegAttrs {
758+
RegClassOrRegBank RCOrRB; ///< Register class or register bank.
759+
LLT Ty; ///< Low-level type.
760+
};
761+
762+
/// Returns register class or bank and low level type of \p Reg. Always safe
763+
/// to use. Special values are returned when \p Reg does not have some of the
764+
/// attributes.
/// Const-qualified: the body only reads through getRegClassOrRegBank() and
/// getType(), so it can also be called on a const MachineRegisterInfo.
765+
VRegAttrs getVRegAttrs(Register Reg) const {
766+
return {getRegClassOrRegBank(Reg), getType(Reg)};
767+
}
768+
769+
/// Create and return a new virtual register in the function with the
770+
/// specified register attributes (register class or bank and low-level type).
771+
Register createVirtualRegister(VRegAttrs RegAttr, StringRef Name = "");
772+
755773
/// Create and return a new virtual register in the function with the same
756774
/// attributes as the given register.
757775
Register cloneVirtualRegister(Register VReg, StringRef Name = "");

llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,25 @@ MachineUniformityInfo computeMachineUniformityInfo(
3232
MachineFunction &F, const MachineCycleInfo &cycleInfo,
3333
const MachineDomTree &domTree, bool HasBranchDivergence);
3434

35+
/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
36+
class MachineUniformityAnalysisPass : public MachineFunctionPass {
37+
// Uniformity info object handed out by getUniformityInfo().
MachineUniformityInfo UI;
38+
39+
public:
40+
static char ID;
41+
42+
MachineUniformityAnalysisPass();
43+
44+
/// Mutable access to the stored uniformity info.
MachineUniformityInfo &getUniformityInfo() { return UI; }
45+
/// Read-only access to the stored uniformity info.
const MachineUniformityInfo &getUniformityInfo() const { return UI; }
46+
47+
// MachineFunctionPass overrides; only declared here.
bool runOnMachineFunction(MachineFunction &F) override;
48+
void getAnalysisUsage(AnalysisUsage &AU) const override;
49+
void print(raw_ostream &OS, const Module *M = nullptr) const override;
50+
51+
// TODO: verify analysis
52+
};
53+
3554
} // namespace llvm
3655

3756
#endif // LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H

llvm/lib/CodeGen/MachineRegisterInfo.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,15 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass,
167167
return Reg;
168168
}
169169

170+
// Creates a new virtual register named \p Name and gives it the register
// class or bank and the low-level type carried by \p RegAttr.
Register MachineRegisterInfo::createVirtualRegister(VRegAttrs RegAttr,
171+
StringRef Name) {
172+
Register Reg = createIncompleteVirtualRegister(Name);
173+
// Store the class-or-bank directly in the register's info entry.
VRegInfo[Reg].first = RegAttr.RCOrRB;
174+
setType(Reg, RegAttr.Ty);
175+
// Called only after both attributes have been set on Reg.
noteNewVirtualRegister(Reg);
176+
return Reg;
177+
}
178+
170179
Register MachineRegisterInfo::cloneVirtualRegister(Register VReg,
171180
StringRef Name) {
172181
Register Reg = createIncompleteVirtualRegister(Name);

llvm/lib/CodeGen/MachineUniformityAnalysis.cpp

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -165,25 +165,6 @@ MachineUniformityInfo llvm::computeMachineUniformityInfo(
165165

166166
namespace {
167167

168-
/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
169-
class MachineUniformityAnalysisPass : public MachineFunctionPass {
170-
MachineUniformityInfo UI;
171-
172-
public:
173-
static char ID;
174-
175-
MachineUniformityAnalysisPass();
176-
177-
MachineUniformityInfo &getUniformityInfo() { return UI; }
178-
const MachineUniformityInfo &getUniformityInfo() const { return UI; }
179-
180-
bool runOnMachineFunction(MachineFunction &F) override;
181-
void getAnalysisUsage(AnalysisUsage &AU) const override;
182-
void print(raw_ostream &OS, const Module *M = nullptr) const override;
183-
184-
// TODO: verify analysis
185-
};
186-
187168
class MachineUniformityInfoPrinterPass : public MachineFunctionPass {
188169
public:
189170
static char ID;

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 144 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "SILowerI1Copies.h"
20+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1921
#include "llvm/CodeGen/MachineFunctionPass.h"
22+
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
23+
#include "llvm/InitializePasses.h"
2024

2125
#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
2226

@@ -42,14 +46,146 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
4246

4347
// Declares this pass's analysis usage: the CFG is marked as preserved, and the
// machine dominator tree, machine post-dominator tree and machine uniformity
// analysis are required, before deferring to the base class.
void getAnalysisUsage(AnalysisUsage &AU) const override {
4448
AU.setPreservesCFG();
49+
AU.addRequired<MachineDominatorTree>();
50+
AU.addRequired<MachinePostDominatorTree>();
51+
AU.addRequired<MachineUniformityAnalysisPass>();
4552
MachineFunctionPass::getAnalysisUsage(AU);
4653
}
4754
};
4855

56+
// PhiLoweringHelper implementation for GlobalISel. It holds a pointer to the
// machine uniformity info and a MachineIRBuilder, and overrides the
// PhiLoweringHelper hooks declared below.
class DivergenceLoweringHelper : public PhiLoweringHelper {
57+
public:
58+
DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
59+
MachinePostDominatorTree *PDT,
60+
MachineUniformityInfo *MUI);
61+
62+
private:
63+
// Uniformity info supplied through the constructor.
MachineUniformityInfo *MUI = nullptr;
64+
// Builder used to emit the instructions this helper creates.
MachineIRBuilder B;
65+
// Returns a new lane mask register holding a copy of Reg.
Register buildRegCopyToLaneMask(Register Reg);
66+
67+
public:
68+
// Overrides of the PhiLoweringHelper interface.
void markAsLaneMask(Register DstReg) const override;
69+
void getCandidatesForLowering(
70+
SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
71+
void collectIncomingValuesFromPhi(
72+
const MachineInstr *MI,
73+
SmallVectorImpl<Incoming> &Incomings) const override;
74+
void replaceDstReg(Register NewReg, Register OldReg,
75+
MachineBasicBlock *MBB) override;
76+
void buildMergeLaneMasks(MachineBasicBlock &MBB,
77+
MachineBasicBlock::iterator I, const DebugLoc &DL,
78+
Register DstReg, Register PrevReg,
79+
Register CurReg) override;
80+
void constrainAsLaneMask(Incoming &In) override;
81+
};
82+
83+
// Forwards MF, DT and PDT to the PhiLoweringHelper base, stores MUI, and
// constructs the MachineIRBuilder B from *MF.
DivergenceLoweringHelper::DivergenceLoweringHelper(
84+
MachineFunction *MF, MachineDominatorTree *DT,
85+
MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
86+
: PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}
87+
88+
// _(s1) -> SReg_32/64(s1)
// Gives the s1 register DstReg the register class returned by
// ST->getBoolRC().
89+
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
90+
// Only s1-typed registers are expected here.
assert(MRI->getType(DstReg) == LLT::scalar(1));
91+
92+
// DstReg already has a register class: constrain it to the getBoolRC()
// class. A failed constraint is treated as unreachable.
if (MRI->getRegClassOrNull(DstReg)) {
93+
if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
94+
return;
95+
llvm_unreachable("Failed to constrain register class");
96+
}
97+
98+
// No register class yet: assign the getBoolRC() class directly.
MRI->setRegClass(DstReg, ST->getBoolRC());
99+
}
100+
101+
// Appends to Vreg1Phis every phi in the function whose result register has
// type s1 and is reported divergent by the uniformity info.
void DivergenceLoweringHelper::getCandidatesForLowering(
102+
SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
103+
LLT S1 = LLT::scalar(1);
104+
105+
// Add divergent i1 phis to the list
106+
for (MachineBasicBlock &MBB : *MF) {
107+
for (MachineInstr &MI : MBB.phis()) {
108+
// Operand 0 of the phi is read as its destination register.
Register Dst = MI.getOperand(0).getReg();
109+
if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
110+
Vreg1Phis.push_back(&MI);
111+
}
112+
}
113+
}
114+
115+
// Appends one Incoming entry to Incomings for each (register, block) operand
// pair of the phi MI.
void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
116+
const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
117+
// Operands are walked in pairs starting at index 1: operand i supplies the
// register and operand i + 1 the basic block. The third constructor argument
// is a default-constructed Register().
for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
118+
Incomings.emplace_back(MI->getOperand(i).getReg(),
119+
MI->getOperand(i + 1).getMBB(), Register());
120+
}
121+
}
122+
123+
// Builds a COPY that defines OldReg from NewReg, inserted at MBB's first
// non-PHI position. An empty debug location ({}) is passed to BuildMI.
void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
124+
MachineBasicBlock *MBB) {
125+
BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
126+
.addReg(NewReg);
127+
}
128+
129+
// Copy Reg to a new lane mask register. The copy is inserted after the
130+
// instruction that defines Reg, skipping phis and labels if needed.
131+
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
132+
Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
133+
// NOTE(review): Instr is dereferenced without a null check, so Reg is
// presumably always expected to have a defining instruction - confirm.
MachineInstr *Instr = MRI->getVRegDef(Reg);
134+
MachineBasicBlock *MBB = Instr->getParent();
135+
// Start from the position right after Instr and let SkipPHIsAndLabels choose
// the insertion point.
B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
136+
B.buildCopy(LaneMask, Reg);
137+
return LaneMask;
138+
}
139+
140+
// bb.previous
141+
// %PrevReg = ...
142+
//
143+
// bb.current
144+
// %CurReg = ...
145+
//
146+
// %DstReg - not defined
147+
//
148+
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
149+
//
150+
// bb.previous
151+
// %PrevReg = ...
152+
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
153+
//
154+
// bb.current
155+
// %CurReg = ...
156+
// %CurRegCopy:sreg_32(s1) = COPY %CurReg
157+
// ...
158+
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes to 0
159+
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
160+
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
161+
//
162+
// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
163+
// Emits, at position I in MBB, the instructions that define DstReg from
// PrevReg and CurReg combined with ExecReg. Both inputs are first copied into
// new lane mask registers via buildRegCopyToLaneMask.
// NOTE: the DL parameter is not referenced in this body.
void DivergenceLoweringHelper::buildMergeLaneMasks(
164+
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
165+
Register DstReg, Register PrevReg, Register CurReg) {
166+
// DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
167+
// TODO: check if inputs are constants or results of a compare.
168+
169+
// buildRegCopyToLaneMask sets the builder's insertion point itself.
Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
170+
Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
171+
Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
172+
Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
173+
174+
// Reposition the builder at the requested point before emitting the merge.
B.setInsertPt(MBB, I);
175+
// PrevMaskedReg = AndN2Op(PrevRegCopy, ExecReg)
B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
176+
// CurMaskedReg = AndOp(ExecReg, CurRegCopy)
B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
177+
// DstReg = OrOp(PrevMaskedReg, CurMaskedReg)
B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
178+
}
179+
180+
// No-op override: nothing is done with \p In.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {}
181+
49182
} // End anonymous namespace.
50183

51184
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
52185
"AMDGPU GlobalISel divergence lowering", false, false)
186+
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
187+
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
188+
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
53189
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
54190
"AMDGPU GlobalISel divergence lowering", false, false)
55191

@@ -64,5 +200,12 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
64200

65201
bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
66202
MachineFunction &MF) {
67-
return false;
203+
MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
204+
MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
205+
MachineUniformityInfo &MUI =
206+
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
207+
208+
DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
209+
210+
return Helper.lowerPhis();
68211
}

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
210210
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
211211
const Register DefReg = I.getOperand(0).getReg();
212212
const LLT DefTy = MRI->getType(DefReg);
213+
213214
if (DefTy == LLT::scalar(1)) {
214215
if (!AllowRiskySelect) {
215216
LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
@@ -3552,8 +3553,6 @@ bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
35523553
}
35533554

35543555
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3555-
if (I.isPHI())
3556-
return selectPHI(I);
35573556

35583557
if (!I.isPreISelOpcode()) {
35593558
if (I.isCopy())
@@ -3696,6 +3695,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
36963695
return selectWaveAddress(I);
36973696
case AMDGPU::G_STACKRESTORE:
36983697
return selectStackRestore(I);
3698+
case AMDGPU::G_PHI:
3699+
return selectPHI(I);
36993700
default:
37003701
return selectImpl(I, *CoverageInfo);
37013702
}

llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@
3131

3232
using namespace llvm;
3333

34-
static Register insertUndefLaneMask(MachineBasicBlock *MBB,
35-
MachineRegisterInfo *MRI,
36-
Register LaneMaskRegAttrs);
34+
static Register
35+
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
36+
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);
3737

3838
namespace {
3939

@@ -78,7 +78,7 @@ class Vreg1LoweringHelper : public PhiLoweringHelper {
7878
MachineBasicBlock::iterator I, const DebugLoc &DL,
7979
Register DstReg, Register PrevReg,
8080
Register CurReg) override;
81-
void constrainIncomingRegisterTakenAsIs(Incoming &In) override;
81+
void constrainAsLaneMask(Incoming &In) override;
8282

8383
bool lowerCopiesFromI1();
8484
bool lowerCopiesToI1();
@@ -304,7 +304,8 @@ class LoopFinder {
304304
/// blocks, so that the SSA updater doesn't have to search all the way to the
305305
/// function entry.
306306
void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
307-
MachineRegisterInfo &MRI, Register LaneMaskRegAttrs,
307+
MachineRegisterInfo &MRI,
308+
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
308309
ArrayRef<Incoming> Incomings = {}) {
309310
assert(LoopLevel < CommonDominators.size());
310311

@@ -411,14 +412,15 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
411412
return new SILowerI1Copies();
412413
}
413414

414-
Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
415-
Register LaneMaskRegAttrs) {
416-
return MRI->cloneVirtualRegister(LaneMaskRegAttrs);
415+
Register
416+
llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
417+
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
418+
return MRI->createVirtualRegister(LaneMaskRegAttrs);
417419
}
418420

419-
static Register insertUndefLaneMask(MachineBasicBlock *MBB,
420-
MachineRegisterInfo *MRI,
421-
Register LaneMaskRegAttrs) {
421+
static Register
422+
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
423+
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
422424
MachineFunction &MF = *MBB->getParent();
423425
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
424426
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -619,7 +621,7 @@ bool PhiLoweringHelper::lowerPhis() {
619621
for (auto &Incoming : Incomings) {
620622
MachineBasicBlock &IMBB = *Incoming.Block;
621623
if (PIA.isSource(IMBB)) {
622-
constrainIncomingRegisterTakenAsIs(Incoming);
624+
constrainAsLaneMask(Incoming);
623625
SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
624626
} else {
625627
Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
@@ -911,6 +913,4 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
911913
}
912914
}
913915

914-
void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) {
915-
return;
916-
}
916+
void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}

llvm/lib/Target/AMDGPU/SILowerI1Copies.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ struct Incoming {
3131
: Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {}
3232
};
3333

34-
Register createLaneMaskReg(MachineRegisterInfo *MRI, Register LaneMaskRegAttrs);
34+
Register createLaneMaskReg(MachineRegisterInfo *MRI,
35+
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);
3536

3637
class PhiLoweringHelper {
3738
public:
@@ -47,7 +48,7 @@ class PhiLoweringHelper {
4748
MachineRegisterInfo *MRI = nullptr;
4849
const GCNSubtarget *ST = nullptr;
4950
const SIInstrInfo *TII = nullptr;
50-
Register LaneMaskRegAttrs;
51+
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs;
5152

5253
#ifndef NDEBUG
5354
DenseSet<Register> PhiRegisters;
@@ -68,7 +69,7 @@ class PhiLoweringHelper {
6869
getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
6970

7071
void initializeLaneMaskRegisterAttributes(Register LaneMask) {
71-
LaneMaskRegAttrs = LaneMask;
72+
LaneMaskRegAttrs = MRI->getVRegAttrs(LaneMask);
7273
}
7374

7475
bool isLaneMaskReg(Register Reg) const {
@@ -91,7 +92,7 @@ class PhiLoweringHelper {
9192
MachineBasicBlock::iterator I,
9293
const DebugLoc &DL, Register DstReg,
9394
Register PrevReg, Register CurReg) = 0;
94-
virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0;
95+
virtual void constrainAsLaneMask(Incoming &In) = 0;
9596
};
9697

9798
} // end namespace llvm

0 commit comments

Comments
 (0)