Skip to content

AMDGPU/GlobalISel: AMDGPURegBankLegalize #112864

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,11 @@ class MachineIRBuilder {
/// Build and insert an unmerge of \p Res sized pieces to cover \p Op
MachineInstrBuilder buildUnmerge(LLT Res, const SrcOp &Op);

/// Build and insert an unmerge of pieces with \p Attrs register attributes to
/// cover \p Op
///
/// Every destination operand is described by the same \p Attrs. The number of
/// pieces is the size in bits of \p Op divided by the size in bits of the
/// type carried in \p Attrs.
MachineInstrBuilder buildUnmerge(MachineRegisterInfo::VRegAttrs Attrs,
const SrcOp &Op);

/// Build and insert \p Res = G_BUILD_VECTOR \p Op0, ...
///
/// G_BUILD_VECTOR creates a vector value from multiple scalar registers.
Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/CodeGen/MachineRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,12 @@ class MachineRegisterInfo {
return dyn_cast_if_present<const TargetRegisterClass *>(Val);
}

/// Return the register bank assigned to \p Reg.
/// Only call this when \p Reg is known to have a register bank: the stored
/// value is converted with cast<>, not with a null-tolerant cast.
const RegisterBank *getRegBank(Register Reg) const {
  const auto &RegInfo = VRegInfo[Reg.id()];
  return cast<const RegisterBank *>(RegInfo.first);
}

/// Return the register bank of \p Reg, or null if Reg has not been assigned
/// a register bank or has been assigned a register class.
/// \note It is possible to get the register bank from the register class via
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,15 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(LLT Res,
return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
}

/// Unmerge \p Op into equally sized pieces. Every destination operand is
/// constructed from the same \p Attrs (register attributes, including the
/// piece type Attrs.Ty).
MachineInstrBuilder
MachineIRBuilder::buildUnmerge(MachineRegisterInfo::VRegAttrs Attrs,
const SrcOp &Op) {
LLT OpTy = Op.getLLTTy(*getMRI());
// Number of pieces: size of the source over size of one piece.
unsigned NumRegs = OpTy.getSizeInBits() / Attrs.Ty.getSizeInBits();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

divideCoefficientBy? Does unmerge on scalable vectors work?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

divideCoefficientBy works only for vectors, we need to unmerge scalars also. Don't know about scalable vectors, they seem to only be used to say something is legal, don't know if there is actual lowering done for them. Other places in builder don't check for them.

// NumRegs destination operands, each one built from Attrs.
SmallVector<DstOp, 8> TmpVec(NumRegs, Attrs);
return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
}

MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<Register> Res,
const SrcOp &Op) {
// Unfortunately to convert from ArrayRef<Register> to ArrayRef<DstOp>,
Expand Down
59 changes: 59 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
//===----------------------------------------------------------------------===//

#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
Expand Down Expand Up @@ -106,3 +109,59 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
}
}

/// Return the type of the pieces that a value of type \p Ty is unmerged into
/// before each piece is read with G_AMDGPU_READANYLANE.
static LLT getReadAnyLaneSplitTy(LLT Ty) {
  // Non-vector types (large scalars and 64-bit pointers) use S32 pieces.
  if (!Ty.isVector())
    return LLT::scalar(32);

  // Vectors of 16-bit elements are split into two-element vectors; all other
  // vectors (S32, S64 or pointer elements) are split into single elements.
  const LLT EltTy = Ty.getElementType();
  return EltTy.getSizeInBits() == 16 ? LLT::fixed_vector(2, EltTy) : EltTy;
}

// Forward declaration: unmergeReadAnyLane calls buildReadAnyLane, and the
// definition of buildReadAnyLane in turn calls unmergeReadAnyLane.
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move the function body to avoid forward declaring

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is circular dependency between buildReadAnyLane and unmergeReadAnyLane

const RegisterBankInfo &RBI);

/// Unmerge \p VgprSrc into pieces of type \p UnmergeTy on the VGPR bank, read
/// every piece with buildReadAnyLane and append the resulting registers to
/// \p SgprDstParts in piece order.
static void unmergeReadAnyLane(MachineIRBuilder &B,
                               SmallVectorImpl<Register> &SgprDstParts,
                               LLT UnmergeTy, Register VgprSrc,
                               const RegisterBankInfo &RBI) {
  const RegisterBank *VgprBank = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
  auto Pieces = B.buildUnmerge({VgprBank, UnmergeTy}, VgprSrc);

  // Every operand except the last one (the unmerged source) is a result.
  const unsigned NumPieces = Pieces->getNumOperands() - 1;
  for (unsigned Idx = 0; Idx != NumPieces; ++Idx)
    SgprDstParts.push_back(buildReadAnyLane(B, Pieces.getReg(Idx), RBI));
}

/// Read \p VgprSrc into a newly created register on the SGPR bank with the
/// same type and return that register. A 32-bit source takes a single
/// G_AMDGPU_READANYLANE; any other size is unmerged, read piece by piece and
/// merged back together.
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
                                 const RegisterBankInfo &RBI) {
  const LLT SrcTy = B.getMRI()->getType(VgprSrc);
  const RegisterBank *SgprBank = &RBI.getRegBank(AMDGPU::SGPRRegBankID);

  if (SrcTy.getSizeInBits() != 32) {
    // Split the source, read each piece, then reassemble on the SGPR bank.
    SmallVector<Register, 8> Parts;
    unmergeReadAnyLane(B, Parts, getReadAnyLaneSplitTy(SrcTy), VgprSrc, RBI);
    auto Merged = B.buildMergeLikeInstr({SgprBank, SrcTy}, Parts);
    return Merged.getReg(0);
  }

  auto ReadLane = B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE,
                               {{SgprBank, SrcTy}}, {VgprSrc});
  return ReadLane.getReg(0);
}

/// Read \p VgprSrc into the existing register \p SgprDst.
///
/// A 32-bit source is read with a single G_AMDGPU_READANYLANE. Any other size
/// is unmerged into pieces (see getReadAnyLaneSplitTy), each piece is read
/// individually, and the results are merged into \p SgprDst.
///
/// \param B       Builder used to insert the new instructions.
/// \param SgprDst Destination register that receives the value.
/// \param VgprSrc Source register; its type decides how the value is split.
/// \param RBI     Provides the register banks for intermediate registers.
void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
                              Register VgprSrc, const RegisterBankInfo &RBI) {
  LLT Ty = B.getMRI()->getType(VgprSrc);
  if (Ty.getSizeInBits() == 32) {
    B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
    return;
  }

  SmallVector<Register, 8> SgprDstParts;
  unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);

  // The merge defines SgprDst directly; there is no result to fetch here.
  B.buildMergeLikeInstr(SgprDst, SgprDstParts);
}
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ class GCNSubtarget;
class GISelKnownBits;
class LLT;
class MachineFunction;
class MachineIRBuilder;
class RegisterBankInfo;

namespace AMDGPU {

Expand Down Expand Up @@ -48,6 +50,9 @@ class IntrinsicLaneMaskAnalyzer {
// This will not be needed when we turn off LCSSA for global-isel.
void findLCSSAPhi(Register Reg);
};

/// Build instructions that read \p VgprSrc into \p SgprDst using
/// G_AMDGPU_READANYLANE. A 32-bit source is read with a single instruction;
/// any other size is unmerged into pieces that are read individually and then
/// merged into \p SgprDst. \p RBI supplies the register banks used for the
/// intermediate registers.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
}
}

Expand Down
90 changes: 89 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
return true;
}

// Select G_AMDGPU_COPY_SCC_VCC: compare the source operand against 0 with
// S_CMP_LG_U64 (wave64) or S_CMP_LG_U32 (otherwise), then copy SCC into the
// destination register, which is constrained to SReg_32. Returns false when
// constraining the compare's operands or the destination register fails.
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if we should just avoid this situation

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This allows for more registers to be allocated to sgpr.
Avoiding them could be done later in some optimization pass when we can decide if we want to do global-isel equivalent of moveToVALU, but we don't do it for correctness but to influence register allocation to sgpr or vgpr.
Why would we want to avoid vcc to scc copy? Question for @nhaehnle also.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it's trivial to avoid this a priori. I agree that a separate cleanup optimization could do it, in any case I'd say it's best left to a separate focused change.

const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();

// The compare opcode width follows the wave size.
unsigned CmpOpc =
STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
.addReg(I.getOperand(1).getReg())
.addImm(0);
if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
return false;

// Copy SCC into the destination; the copy is inserted after the compare.
Register DstReg = I.getOperand(0).getReg();
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

// The generic instruction is fully replaced by the compare and the copy.
I.eraseFromParent();
return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}

// Select G_AMDGPU_COPY_VCC_SCC.
//
// A constant source is materialized directly: 0 becomes an S_MOV_B64 (wave64)
// or S_MOV_B32 of immediate 0, and 1 becomes a copy of the register returned
// by TRI.getExec(). Any other source is copied to SCC and turned into the
// destination with S_CSELECT_B64 (wave64) or S_CSELECT_B32 choosing between
// that exec register and 0.
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &Loc = I.getDebugLoc();

  const Register Dst = I.getOperand(0).getReg();
  const Register Src = I.getOperand(1).getReg();

  if (std::optional<ValueAndVReg> KnownSrc =
          getIConstantVRegValWithLookThrough(Src, *MRI)) {
    const int64_t Value = KnownSrc->Value.getZExtValue();
    if (Value != 0) {
      assert(Value == 1);
      BuildMI(MBB, &I, Loc, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
    } else {
      const unsigned MovOpc =
          STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(MBB, &I, Loc, TII.get(MovOpc), Dst).addImm(0);
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that the source is bool in reg (high bits are 0).
  BuildMI(MBB, &I, Loc, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(Src);

  const unsigned CSelectOpc =
      STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *CSelect = BuildMI(MBB, &I, Loc, TII.get(CSelectOpc), Dst)
                              .addReg(TRI.getExec())
                              .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*CSelect, TII, TRI, RBI);
}

// Select G_AMDGPU_READANYLANE by replacing it with V_READFIRSTLANE_B32 from
// the source operand into the destination operand. Returns the result of
// constraining the new instruction's register operands.
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  const Register Dst = I.getOperand(0).getReg();
  const Register Src = I.getOperand(1).getReg();
  MachineBasicBlock &MBB = *I.getParent();

  MachineInstr *ReadFirstLane =
      BuildMI(MBB, &I, I.getDebugLoc(), TII.get(AMDGPU::V_READFIRSTLANE_B32),
              Dst)
          .addReg(Src);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*ReadFirstLane, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI->getType(DefReg);
Expand Down Expand Up @@ -249,7 +317,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
}

// TODO: Verify that all registers have the same bank
// If inputs have register bank, assign corresponding reg class.
// Note: registers don't need to have the same reg bank.
for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
const Register SrcReg = I.getOperand(i).getReg();

const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
if (RB) {
const LLT SrcTy = MRI->getType(SrcReg);
const TargetRegisterClass *SrcRC =
TRI.getRegClassForTypeOnBank(SrcTy, *RB);
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
return false;
}
}

I.setDesc(TII.get(TargetOpcode::PHI));
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}
Expand Down Expand Up @@ -4014,6 +4096,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectStackRestore(I);
case AMDGPU::G_PHI:
return selectPHI(I);
case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
return selectCOPY_SCC_VCC(I);
case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
return selectCOPY_VCC_SCC(I);
case AMDGPU::G_AMDGPU_READANYLANE:
return selectReadAnyLane(I);
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
default:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {

bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
bool selectCOPY(MachineInstr &I) const;
// Select G_AMDGPU_COPY_SCC_VCC.
bool selectCOPY_SCC_VCC(MachineInstr &I) const;
// Select G_AMDGPU_COPY_VCC_SCC.
bool selectCOPY_VCC_SCC(MachineInstr &I) const;
// Select G_AMDGPU_READANYLANE as V_READFIRSTLANE_B32.
bool selectReadAnyLane(MachineInstr &I) const;
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
Expand Down
Loading
Loading