-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU/GlobalISel: AMDGPURegBankLegalize #112864
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,10 +7,13 @@ | |
//===----------------------------------------------------------------------===// | ||
|
||
#include "AMDGPUGlobalISelUtils.h" | ||
#include "AMDGPURegisterBankInfo.h" | ||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h" | ||
#include "llvm/ADT/DenseSet.h" | ||
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" | ||
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" | ||
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | ||
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | ||
#include "llvm/CodeGenTypes/LowLevelType.h" | ||
#include "llvm/IR/Constants.h" | ||
#include "llvm/IR/IntrinsicsAMDGPU.h" | ||
|
@@ -106,3 +109,59 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { | |
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg()); | ||
} | ||
} | ||
|
||
/// Choose the piece type that a value of type \p Ty is unmerged into by the
/// read-any-lane helpers in this file.
///
/// \param Ty type of the value being split.
/// \returns a 2-element fixed vector of the element type when \p Ty is a
///          vector of 16-bit elements, the element type itself for any other
///          vector, and a 32-bit scalar for every non-vector type.
static LLT getReadAnyLaneSplitTy(LLT Ty) {
  // Large scalars and 64-bit pointers
  if (!Ty.isVector())
    return LLT::scalar(32);

  const LLT EltTy = Ty.getElementType();
  // 16-bit elements are grouped in pairs; S32, S64 or pointer elements are
  // used as they are.
  return EltTy.getSizeInBits() == 16 ? LLT::fixed_vector(2, EltTy) : EltTy;
}
|
||
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Move the function body to avoid forward declaring. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is a circular dependency between buildReadAnyLane and unmergeReadAnyLane. |
||
const RegisterBankInfo &RBI); | ||
|
||
/// Split \p VgprSrc into pieces of type \p UnmergeTy on the VGPR bank and
/// append one read-any-lane result per piece to \p SgprDstParts.
///
/// \param B            builder used to emit the unmerge and the reads.
/// \param SgprDstParts output list; results are appended in piece order.
/// \param UnmergeTy    type of each piece produced by the unmerge.
/// \param VgprSrc      register that is split.
/// \param RBI          register bank info used to look up the VGPR bank.
static void unmergeReadAnyLane(MachineIRBuilder &B,
                               SmallVectorImpl<Register> &SgprDstParts,
                               LLT UnmergeTy, Register VgprSrc,
                               const RegisterBankInfo &RBI) {
  const RegisterBank *VgprBank = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
  auto Pieces = B.buildUnmerge({VgprBank, UnmergeTy}, VgprSrc);

  // Every operand except the final one is visited; the final operand of the
  // unmerge is the register being split.
  const unsigned NumPieces = Pieces->getNumOperands() - 1;
  for (unsigned Idx = 0; Idx < NumPieces; ++Idx)
    SgprDstParts.push_back(buildReadAnyLane(B, Pieces.getReg(Idx), RBI));
}
|
||
/// Emit code that reads \p VgprSrc into a newly created SGPR-bank register of
/// the same type and return that register.
///
/// A 32-bit value is read with a single G_AMDGPU_READANYLANE. Any other size
/// is unmerged into pieces, each piece is read individually, and the results
/// are merged back together.
///
/// \param B       builder used to emit the instructions.
/// \param VgprSrc register to read.
/// \param RBI     register bank info used to look up the SGPR bank.
/// \returns the register holding the result.
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
                                 const RegisterBankInfo &RBI) {
  const LLT SrcTy = B.getMRI()->getType(VgprSrc);
  const RegisterBank *SgprBank = &RBI.getRegBank(AMDGPU::SGPRRegBankID);

  if (SrcTy.getSizeInBits() != 32) {
    // Not a single 32-bit value: read piece by piece and reassemble.
    SmallVector<Register, 8> Parts;
    unmergeReadAnyLane(B, Parts, getReadAnyLaneSplitTy(SrcTy), VgprSrc, RBI);
    return B.buildMergeLikeInstr({SgprBank, SrcTy}, Parts).getReg(0);
  }

  auto ReadAnyLane = B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE,
                                  {{SgprBank, SrcTy}}, {VgprSrc});
  return ReadAnyLane.getReg(0);
}
|
||
/// Emit code that reads \p VgprSrc into the existing register \p SgprDst.
///
/// A 32-bit value is read with a single G_AMDGPU_READANYLANE that defines
/// \p SgprDst. Any other size is unmerged into pieces, each piece is read
/// individually, and the results are merged into \p SgprDst.
///
/// \param B       builder used to emit the instructions.
/// \param SgprDst register that receives the result.
/// \param VgprSrc register to read.
/// \param RBI     register bank info forwarded to the per-piece helper.
void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
                              Register VgprSrc, const RegisterBankInfo &RBI) {
  const LLT SrcTy = B.getMRI()->getType(VgprSrc);

  if (SrcTy.getSizeInBits() != 32) {
    // Not a single 32-bit value: read piece by piece and reassemble.
    SmallVector<Register, 8> Parts;
    unmergeReadAnyLane(B, Parts, getReadAnyLaneSplitTy(SrcTy), VgprSrc, RBI);
    B.buildMergeLikeInstr(SgprDst, Parts);
    return;
  }

  B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { | |
return true; | ||
} | ||
|
||
/// Select G_AMDGPU_COPY_SCC_VCC.
///
/// Emits a scalar "not equal to 0" compare of the source operand
/// (S_CMP_LG_U64 on wave64, S_CMP_LG_U32 otherwise), copies SCC into the
/// destination register, erases \p I and constrains the destination to
/// SReg_32.
///
/// NOTE(review): the review discussion on this function asked whether this
/// situation should simply be avoided. It was kept because it allows more
/// registers to be allocated to SGPRs; avoiding it up front was considered
/// non-trivial and was left to a possible separate cleanup optimization.
///
/// \returns false if constraining the compare or the destination register
///          fails, true otherwise.
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &Loc = I.getDebugLoc();

  // The compare width follows the wave size.
  const bool IsWave64 = STI.isWave64();
  const unsigned CompareOpc =
      IsWave64 ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
  MachineInstr *Compare = BuildMI(MBB, &I, Loc, TII.get(CompareOpc))
                              .addReg(I.getOperand(1).getReg())
                              .addImm(0);
  if (!constrainSelectedInstRegOperands(*Compare, TII, TRI, RBI))
    return false;

  // Read the destination before I is erased below.
  const Register Dst = I.getOperand(0).getReg();
  BuildMI(MBB, &I, Loc, TII.get(AMDGPU::COPY), Dst).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
}
|
||
/// Select G_AMDGPU_COPY_VCC_SCC.
///
/// When the source is a known integer constant (found by looking through its
/// definition), the result is materialized directly: 0 becomes an S_MOV of 0
/// (B64 on wave64, B32 otherwise) and 1 becomes a copy of the exec register.
/// Otherwise the source is copied into SCC and an S_CSELECT (B64 on wave64,
/// B32 otherwise) chooses between the exec register and 0.
///
/// \returns false if constraining the destination register or the select
///          instruction fails, true otherwise.
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &Loc = I.getDebugLoc();

  const Register Dst = I.getOperand(0).getReg();
  const Register Src = I.getOperand(1).getReg();

  if (std::optional<ValueAndVReg> Const =
          getIConstantVRegValWithLookThrough(Src, *MRI)) {
    const int64_t Imm = Const->Value.getZExtValue();
    if (Imm != 0) {
      // The only other constant expected here is 1, which maps to exec.
      assert(Imm == 1);
      BuildMI(MBB, &I, Loc, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
    } else {
      const unsigned MovOpc =
          STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(MBB, &I, Loc, TII.get(MovOpc), Dst).addImm(0);
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(MBB, &I, Loc, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(Src);

  const unsigned CSelectOpc =
      STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *CSelect = BuildMI(MBB, &I, Loc, TII.get(CSelectOpc), Dst)
                              .addReg(TRI.getExec())
                              .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*CSelect, TII, TRI, RBI);
}
|
||
/// Select G_AMDGPU_READANYLANE.
///
/// Emits a V_READFIRSTLANE_B32 that reads operand 1 of \p I into operand 0,
/// erases \p I and constrains the register operands of the new instruction.
///
/// \returns the result of constraining the new instruction's operands.
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &Loc = I.getDebugLoc();

  // Read both registers before I is erased below.
  const Register Dst = I.getOperand(0).getReg();
  const Register Src = I.getOperand(1).getReg();

  MachineInstr *ReadFirstLane =
      BuildMI(MBB, &I, Loc, TII.get(AMDGPU::V_READFIRSTLANE_B32), Dst)
          .addReg(Src);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*ReadFirstLane, TII, TRI, RBI);
}
|
||
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { | ||
const Register DefReg = I.getOperand(0).getReg(); | ||
const LLT DefTy = MRI->getType(DefReg); | ||
|
@@ -249,7 +317,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { | |
} | ||
} | ||
|
||
// TODO: Verify that all registers have the same bank | ||
// If inputs have register bank, assign corresponding reg class. | ||
// Note: registers don't need to have the same reg bank. | ||
for (unsigned i = 1; i != I.getNumOperands(); i += 2) { | ||
const Register SrcReg = I.getOperand(i).getReg(); | ||
|
||
const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg); | ||
if (RB) { | ||
const LLT SrcTy = MRI->getType(SrcReg); | ||
const TargetRegisterClass *SrcRC = | ||
TRI.getRegClassForTypeOnBank(SrcTy, *RB); | ||
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) | ||
return false; | ||
} | ||
} | ||
|
||
I.setDesc(TII.get(TargetOpcode::PHI)); | ||
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); | ||
} | ||
|
@@ -4014,6 +4096,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { | |
return selectStackRestore(I); | ||
case AMDGPU::G_PHI: | ||
return selectPHI(I); | ||
case AMDGPU::G_AMDGPU_COPY_SCC_VCC: | ||
return selectCOPY_SCC_VCC(I); | ||
case AMDGPU::G_AMDGPU_COPY_VCC_SCC: | ||
return selectCOPY_VCC_SCC(I); | ||
case AMDGPU::G_AMDGPU_READANYLANE: | ||
return selectReadAnyLane(I); | ||
case TargetOpcode::G_CONSTANT: | ||
case TargetOpcode::G_FCONSTANT: | ||
default: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
divideCoefficientBy? Does unmerge on scalable vectors work?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
divideCoefficientBy works only for vectors, but we also need to unmerge scalars. I don't know about scalable vectors; they seem to be used only to say that something is legal, and I don't know whether any actual lowering is done for them. Other places in the builder don't check for them.