Skip to content

AMDGPU/GlobalISelDivergenceLowering: select divergent i1 phis #80003

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions llvm/include/llvm/CodeGen/MachineRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,24 @@ class MachineRegisterInfo {
Register createVirtualRegister(const TargetRegisterClass *RegClass,
StringRef Name = "");

/// All attributes (register class or bank, and low-level type) a virtual
/// register can have.
struct VRegAttrs {
/// Register class or register bank of the virtual register.
RegClassOrRegBank RCOrRB;
/// Low-level type of the virtual register.
LLT Ty;
};

/// Bundles the register class or bank and the low-level type of \p Reg into
/// a VRegAttrs value. Always safe to call: when \p Reg lacks one of these
/// attributes, the corresponding field holds the special value returned by
/// the underlying query.
VRegAttrs getVRegAttrs(Register Reg) {
  VRegAttrs Attrs;
  Attrs.RCOrRB = getRegClassOrRegBank(Reg);
  Attrs.Ty = getType(Reg);
  return Attrs;
}

/// Create and return a new virtual register in the function with the
/// specified register attributes (register class or bank, and low-level type).
Register createVirtualRegister(VRegAttrs RegAttr, StringRef Name = "");

/// Create and return a new virtual register in the function with the same
/// attributes as the given register.
Register cloneVirtualRegister(Register VReg, StringRef Name = "");
Expand Down
19 changes: 19 additions & 0 deletions llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,25 @@ MachineUniformityInfo computeMachineUniformityInfo(
MachineFunction &F, const MachineCycleInfo &cycleInfo,
const MachineDomTree &domTree, bool HasBranchDivergence);

/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
///
/// The pass object owns a MachineUniformityInfo instance and hands out
/// references to it through getUniformityInfo().
class MachineUniformityAnalysisPass : public MachineFunctionPass {
// Uniformity info held by this pass object; presumably filled in by
// runOnMachineFunction (its definition is not visible here).
MachineUniformityInfo UI;

public:
// Static pass identifier.
static char ID;

MachineUniformityAnalysisPass();

/// Mutable and read-only access to the uniformity info held by this pass.
MachineUniformityInfo &getUniformityInfo() { return UI; }
const MachineUniformityInfo &getUniformityInfo() const { return UI; }

// MachineFunctionPass overrides; defined out of line.
bool runOnMachineFunction(MachineFunction &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
void print(raw_ostream &OS, const Module *M = nullptr) const override;

// TODO: verify analysis
};

} // namespace llvm

#endif // LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H
9 changes: 9 additions & 0 deletions llvm/lib/CodeGen/MachineRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,15 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass,
return Reg;
}

// Create a new virtual register carrying the attributes bundled in RegAttr:
// its register class or bank is stored directly in the register's VRegInfo
// entry and its low-level type is set through setType().
Register MachineRegisterInfo::createVirtualRegister(VRegAttrs RegAttr,
                                                    StringRef Name) {
  Register NewVReg = createIncompleteVirtualRegister(Name);

  // Attach class/bank first, then the LLT, mirroring the attribute order in
  // VRegAttrs.
  VRegInfo[NewVReg].first = RegAttr.RCOrRB;
  setType(NewVReg, RegAttr.Ty);

  // The register is fully described now; report it as newly created.
  noteNewVirtualRegister(NewVReg);
  return NewVReg;
}

Register MachineRegisterInfo::cloneVirtualRegister(Register VReg,
StringRef Name) {
Register Reg = createIncompleteVirtualRegister(Name);
Expand Down
19 changes: 0 additions & 19 deletions llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,25 +165,6 @@ MachineUniformityInfo llvm::computeMachineUniformityInfo(

namespace {

/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
///
/// NOTE(review): this declaration is textually identical to the one shown for
/// MachineUniformityAnalysis.h — presumably the copy being moved out of this
/// file; confirm only one of them remains.
class MachineUniformityAnalysisPass : public MachineFunctionPass {
// Uniformity info held by this pass object.
MachineUniformityInfo UI;

public:
// Static pass identifier.
static char ID;

MachineUniformityAnalysisPass();

/// Mutable and read-only access to the uniformity info held by this pass.
MachineUniformityInfo &getUniformityInfo() { return UI; }
const MachineUniformityInfo &getUniformityInfo() const { return UI; }

// MachineFunctionPass overrides; defined out of line.
bool runOnMachineFunction(MachineFunction &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
void print(raw_ostream &OS, const Module *M = nullptr) const override;

// TODO: verify analysis
};

class MachineUniformityInfoPrinterPass : public MachineFunctionPass {
public:
static char ID;
Expand Down
145 changes: 144 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"

Expand All @@ -42,14 +46,146 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {

// Declare what this pass preserves and which analyses it needs before it
// runs.
void getAnalysisUsage(AnalysisUsage &AU) const override {
// This pass declares the CFG as preserved.
AU.setPreservesCFG();
// Analyses fetched in runOnMachineFunction and handed to the lowering helper.
AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachinePostDominatorTree>();
AU.addRequired<MachineUniformityAnalysisPass>();
// Let the base class add its own requirements.
MachineFunctionPass::getAnalysisUsage(AU);
}
};

// PhiLoweringHelper implementation used by the GlobalISel divergence lowering
// pass. It consults MachineUniformityInfo to pick the phis to lower and emits
// new instructions through a MachineIRBuilder.
class DivergenceLoweringHelper : public PhiLoweringHelper {
public:
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                           MachinePostDominatorTree *PDT,
                           MachineUniformityInfo *MUI);

  // PhiLoweringHelper interface.
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

private:
  // Uniformity analysis result used to decide which phis are divergent.
  MachineUniformityInfo *MUI = nullptr;
  // Builder used for every instruction this helper inserts.
  MachineIRBuilder B;

  // Copies Reg into a fresh lane mask register right after Reg's definition.
  Register buildRegCopyToLaneMask(Register Reg);
};

// Forwards the function and (post-)dominator trees to PhiLoweringHelper,
// remembers the uniformity info and constructs the builder from the function.
DivergenceLoweringHelper::DivergenceLoweringHelper(
MachineFunction *MF, MachineDominatorTree *DT,
MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
: PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}

// _(s1) -> SReg_32/64(s1)
//
// Gives the s1 register DstReg the subtarget's boolean register class. A
// register without a class is simply assigned one; a register that already
// has a class must be constrainable to it.
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
  assert(MRI->getType(DstReg) == LLT::scalar(1));

  const TargetRegisterClass *BoolRC = ST->getBoolRC();

  // No class yet: nothing to reconcile, just set it.
  if (!MRI->getRegClassOrNull(DstReg)) {
    MRI->setRegClass(DstReg, BoolRC);
    return;
  }

  // An existing class has to be compatible with the lane mask class.
  if (!MRI->constrainRegClass(DstReg, BoolRC))
    llvm_unreachable("Failed to constrain register class");
}

// Appends to Vreg1Phis every phi in the function whose result has type s1 and
// is reported divergent by the uniformity info, in block order.
void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  const LLT BoolTy = LLT::scalar(1);

  for (MachineBasicBlock &Block : *MF) {
    for (MachineInstr &Phi : Block.phis()) {
      Register PhiDst = Phi.getOperand(0).getReg();

      // Only boolean phis are of interest ...
      if (MRI->getType(PhiDst) != BoolTy)
        continue;
      // ... and only the divergent ones need lowering.
      if (!MUI->isDivergent(PhiDst))
        continue;

      Vreg1Phis.push_back(&Phi);
    }
  }
}

void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
Incomings.emplace_back(MI->getOperand(i).getReg(),
MI->getOperand(i + 1).getMBB(), Register());
}
}

// Defines OldReg as a COPY of NewReg, inserted at the first non-phi position
// of MBB.
void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
  MachineBasicBlock::iterator InsertPt = MBB->getFirstNonPHI();
  const auto &CopyDesc = TII->get(AMDGPU::COPY);
  BuildMI(*MBB, InsertPt, {}, CopyDesc, OldReg).addReg(NewReg);
}

// Creates a new lane mask register and defines it as a COPY of Reg. The copy
// is placed directly after the instruction defining Reg, moved past any phis
// and labels that follow that position.
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
  Register MaskCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);

  MachineInstr *DefMI = MRI->getVRegDef(Reg);
  MachineBasicBlock *DefMBB = DefMI->getParent();
  auto AfterDef = std::next(DefMI->getIterator());

  B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(AfterDef));
  B.buildCopy(MaskCopy, Reg);
  return MaskCopy;
}

// Merges two lane masks under EXEC:
//
//   DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
//
// i.e. for active lanes the bit from PrevReg is overwritten with the bit from
// CurReg, while inactive lanes keep the bit from PrevReg.
//
// Input:
//
// bb.previous
//   %PrevReg = ...
//
// bb.current
//   %CurReg = ...
//
// %DstReg - not defined
//
// Output (wave32 example, new registers have sreg_32 reg class and S1 LLT):
//
// bb.previous
//   %PrevReg = ...
//   %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current
//   %CurReg = ...
//   %CurRegCopy:sreg_32(s1) = COPY %CurReg
//   ...
//   %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes to 0
//   %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
//   %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
void DivergenceLoweringHelper::buildMergeLaneMasks(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
    Register DstReg, Register PrevReg, Register CurReg) {
  // TODO: check if inputs are constants or results of a compare.

  // Lane mask copies of both inputs, emitted next to their definitions.
  Register PrevMask = buildRegCopyToLaneMask(PrevReg);
  Register CurMask = buildRegCopyToLaneMask(CurReg);

  // Temporaries for the two halves of the merge.
  Register PrevOutsideExec = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurInsideExec = createLaneMaskReg(MRI, LaneMaskRegAttrs);

  // The merge itself goes at the requested position in MBB.
  B.setInsertPt(MBB, I);
  B.buildInstr(AndN2Op, {PrevOutsideExec}, {PrevMask, ExecReg});
  B.buildInstr(AndOp, {CurInsideExec}, {ExecReg, CurMask});
  B.buildInstr(OrOp, {DstReg}, {PrevOutsideExec, CurInsideExec});
}

// Intentionally a no-op in this helper: the incoming value is left untouched.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {}

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
"AMDGPU GlobalISel divergence lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
"AMDGPU GlobalISel divergence lowering", false, false)

Expand All @@ -64,5 +200,12 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {

// Pass entry point for one machine function.
bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
MachineFunction &MF) {
// NOTE(review): this unconditional return makes everything below unreachable,
// so the phi lowering never runs as written. It looks like the removed side
// of a diff hunk that ended up in the text — confirm and drop it if so.
return false;
// Fetch the analyses declared as required in getAnalysisUsage.
MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
MachineUniformityInfo &MUI =
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

// The helper performs the actual work.
DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

// The result of lowerPhis() is reported as this pass's return value.
return Helper.lowerPhis();
}
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI->getType(DefReg);

if (DefTy == LLT::scalar(1)) {
if (!AllowRiskySelect) {
LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
Expand Down Expand Up @@ -3552,8 +3553,6 @@ bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
}

bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);

if (!I.isPreISelOpcode()) {
if (I.isCopy())
Expand Down Expand Up @@ -3696,6 +3695,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectWaveAddress(I);
case AMDGPU::G_STACKRESTORE:
return selectStackRestore(I);
case AMDGPU::G_PHI:
return selectPHI(I);
default:
return selectImpl(I, *CoverageInfo);
}
Expand Down
30 changes: 15 additions & 15 deletions llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@

using namespace llvm;

static Register insertUndefLaneMask(MachineBasicBlock *MBB,
MachineRegisterInfo *MRI,
Register LaneMaskRegAttrs);
static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);

namespace {

Expand Down Expand Up @@ -78,7 +78,7 @@ class Vreg1LoweringHelper : public PhiLoweringHelper {
MachineBasicBlock::iterator I, const DebugLoc &DL,
Register DstReg, Register PrevReg,
Register CurReg) override;
void constrainIncomingRegisterTakenAsIs(Incoming &In) override;
void constrainAsLaneMask(Incoming &In) override;

bool lowerCopiesFromI1();
bool lowerCopiesToI1();
Expand Down Expand Up @@ -304,7 +304,8 @@ class LoopFinder {
/// blocks, so that the SSA updater doesn't have to search all the way to the
/// function entry.
void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
MachineRegisterInfo &MRI, Register LaneMaskRegAttrs,
MachineRegisterInfo &MRI,
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
ArrayRef<Incoming> Incomings = {}) {
assert(LoopLevel < CommonDominators.size());

Expand Down Expand Up @@ -411,14 +412,15 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
return new SILowerI1Copies();
}

Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
Register LaneMaskRegAttrs) {
return MRI->cloneVirtualRegister(LaneMaskRegAttrs);
Register
llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
return MRI->createVirtualRegister(LaneMaskRegAttrs);
}

static Register insertUndefLaneMask(MachineBasicBlock *MBB,
MachineRegisterInfo *MRI,
Register LaneMaskRegAttrs) {
static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
MachineFunction &MF = *MBB->getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
Expand Down Expand Up @@ -619,7 +621,7 @@ bool PhiLoweringHelper::lowerPhis() {
for (auto &Incoming : Incomings) {
MachineBasicBlock &IMBB = *Incoming.Block;
if (PIA.isSource(IMBB)) {
constrainIncomingRegisterTakenAsIs(Incoming);
constrainAsLaneMask(Incoming);
SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
} else {
Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
Expand Down Expand Up @@ -911,6 +913,4 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
}
}

void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) {
return;
}
void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/SILowerI1Copies.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ struct Incoming {
: Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {}
};

Register createLaneMaskReg(MachineRegisterInfo *MRI, Register LaneMaskRegAttrs);
Register createLaneMaskReg(MachineRegisterInfo *MRI,
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);

class PhiLoweringHelper {
public:
Expand All @@ -47,7 +48,7 @@ class PhiLoweringHelper {
MachineRegisterInfo *MRI = nullptr;
const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
Register LaneMaskRegAttrs;
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs;

#ifndef NDEBUG
DenseSet<Register> PhiRegisters;
Expand All @@ -68,7 +69,7 @@ class PhiLoweringHelper {
getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

void initializeLaneMaskRegisterAttributes(Register LaneMask) {
LaneMaskRegAttrs = LaneMask;
LaneMaskRegAttrs = MRI->getVRegAttrs(LaneMask);
}

bool isLaneMaskReg(Register Reg) const {
Expand All @@ -91,7 +92,7 @@ class PhiLoweringHelper {
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register DstReg,
Register PrevReg, Register CurReg) = 0;
virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0;
virtual void constrainAsLaneMask(Incoming &In) = 0;
};

} // end namespace llvm
Loading