Skip to content

Reland [AMDGPU] Support block load/store for CSR #130013 #137169

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llvm/include/llvm/CodeGen/MachineFrameInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class CalleeSavedInfo {
MCRegister getReg() const { return Reg; }
int getFrameIdx() const { return FrameIdx; }
MCRegister getDstReg() const { return DstReg; }
/// Overwrite the register recorded in this entry (the value getReg() returns).
void setReg(MCRegister R) {
Reg = R;
}
void setFrameIdx(int FI) {
FrameIdx = FI;
SpilledToReg = false;
Expand Down
17 changes: 17 additions & 0 deletions llvm/include/llvm/CodeGen/TargetFrameLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,14 @@ class TargetFrameLowering {
return false;
}

/// spillCalleeSavedRegister - Default implementation for spilling a single
/// callee saved register. The spill code is inserted at \p MI in
/// \p SaveBlock: when \p CS is marked as spilled to a register a COPY into
/// its destination register is emitted, otherwise the register is stored to
/// the frame index recorded in \p CS.
void spillCalleeSavedRegister(MachineBasicBlock &SaveBlock,
MachineBasicBlock::iterator MI,
const CalleeSavedInfo &CS,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) const;

/// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee
/// saved registers and returns true if it isn't possible / profitable to do
/// so by issuing a series of load instructions via loadRegToStackSlot().
Expand All @@ -284,6 +292,15 @@ class TargetFrameLowering {
return false;
}

/// restoreCalleeSavedRegister - Default implementation for restoring a single
/// callee saved register. Should be called in reverse order. Can insert
/// multiple instructions. When \p CS is marked as spilled to a register a
/// COPY back from its destination register is emitted, otherwise the register
/// is reloaded from the frame index recorded in \p CS.
void restoreCalleeSavedRegister(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const CalleeSavedInfo &CS,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) const;

/// hasFP - Return true if the specified function should have a dedicated
/// frame pointer register. For most targets this is true only if the function
/// has variable sized allocas or if frame pointer elimination is disabled.
Expand Down
35 changes: 6 additions & 29 deletions llvm/lib/CodeGen/PrologEpilogInserter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -476,8 +476,8 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
// Now that we know which registers need to be saved and restored, allocate
// stack slots for them.
for (auto &CS : CSI) {
// If the target has spilled this register to another register, we don't
// need to allocate a stack slot.
// If the target has spilled this register to another register or already
// handled it, we don't need to allocate a stack slot.
if (CS.isSpilledToReg())
continue;

Expand Down Expand Up @@ -597,25 +597,14 @@ static void updateLiveness(MachineFunction &MF) {
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
ArrayRef<CalleeSavedInfo> CSI) {
MachineFunction &MF = *SaveBlock.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();

MachineBasicBlock::iterator I = SaveBlock.begin();
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CS : CSI) {
// Insert the spill to the stack frame.
MCRegister Reg = CS.getReg();

if (CS.isSpilledToReg()) {
BuildMI(SaveBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY),
CS.getDstReg())
.addReg(Reg, getKillRegState(true));
} else {
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
TRI, Register());
}
TFI->spillCalleeSavedRegister(SaveBlock, I, CS, TII, TRI);
}
}
}
Expand All @@ -624,7 +613,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
std::vector<CalleeSavedInfo> &CSI) {
MachineFunction &MF = *RestoreBlock.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();

Expand All @@ -634,19 +623,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,

if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CI : reverse(CSI)) {
MCRegister Reg = CI.getReg();
if (CI.isSpilledToReg()) {
BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)
.addReg(CI.getDstReg(), getKillRegState(true));
} else {
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC,
TRI, Register());
assert(I != RestoreBlock.begin() &&
"loadRegFromStackSlot didn't insert any code!");
// Insert in reverse order. loadRegFromStackSlot can insert
// multiple instructions.
}
TFI->restoreCalleeSavedRegister(RestoreBlock, I, CI, TII, TRI);
}
}
}
Expand Down
35 changes: 35 additions & 0 deletions llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
Expand Down Expand Up @@ -182,3 +183,37 @@ TargetFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
return DwarfFrameBase{DwarfFrameBase::Register, {RI->getFrameRegister(MF).id()}};
}

/// Default spill of one callee-saved register described by \p CS, inserted at
/// \p MI in \p SaveBlock. Entries marked as spilled to a register get a COPY
/// into their destination register; all others are stored to their frame
/// index.
void TargetFrameLowering::spillCalleeSavedRegister(
    MachineBasicBlock &SaveBlock, MachineBasicBlock::iterator MI,
    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI) const {
  MCRegister SavedReg = CS.getReg();

  // Stack case: store the register into the frame index recorded in CS,
  // using the minimal physical register class that contains it.
  if (!CS.isSpilledToReg()) {
    const TargetRegisterClass *SpillRC = TRI->getMinimalPhysRegClass(SavedReg);
    TII->storeRegToStackSlot(SaveBlock, MI, SavedReg, true, CS.getFrameIdx(),
                             SpillRC, TRI, Register());
    return;
  }

  // Register case: copy the callee-saved register into its destination
  // register, marking the source as killed.
  BuildMI(SaveBlock, MI, DebugLoc(), TII->get(TargetOpcode::COPY),
          CS.getDstReg())
      .addReg(SavedReg, getKillRegState(true));
}

/// Default restore of one callee-saved register described by \p CS, inserted
/// at \p MI in \p MBB. Entries marked as spilled to a register get a COPY back
/// from their destination register; all others are reloaded from their frame
/// index.
void TargetFrameLowering::restoreCalleeSavedRegister(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI) const {
  MCRegister RestoredReg = CS.getReg();

  // Stack case: reload from the frame index recorded in CS and check that
  // the reload actually emitted something ahead of the insertion point.
  if (!CS.isSpilledToReg()) {
    const TargetRegisterClass *ReloadRC =
        TRI->getMinimalPhysRegClass(RestoredReg);
    TII->loadRegFromStackSlot(MBB, MI, RestoredReg, CS.getFrameIdx(), ReloadRC,
                              TRI, Register());
    assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
    return;
  }

  // Register case: copy the saved value back, killing the holding register.
  BuildMI(MBB, MI, DebugLoc(), TII->get(TargetOpcode::COPY), RestoredReg)
      .addReg(CS.getDstReg(), getKillRegState(true));
}
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1275,6 +1275,14 @@ def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32
"Use a block size of 32 for dynamic VGPR allocation (default is 16)"
>;

// Subtarget feature "block-vgpr-csr": save and restore the callee-saved VGPRs
// with the SCRATCH_STORE/LOAD_BLOCK instructions.
def FeatureUseBlockVGPROpsForCSR
    : SubtargetFeature<"block-vgpr-csr", "UseBlockVGPROpsForCSR", "true",
                       "Use block load/store for VGPR callee saved registers">;

def FeatureLshlAddU64Inst
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
"Has v_lshl_add_u64 instruction">;
Expand Down
37 changes: 37 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Constants.h"
Expand Down Expand Up @@ -243,6 +244,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV,
return AsmPrinter::lowerConstant(CV, BaseCV, Offset);
}

/// Emit a raw assembly comment listing the registers that the block
/// load/store \p MI may transfer.
///
/// The register block is read from the `vdst` operand when \p MI may load and
/// from the `vdata` operand otherwise. Bit I of the mask recorded in \p MFI
/// for that block selects the register I positions after the block's `sub0`
/// register. Nothing is emitted when the recorded mask is zero.
static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
                                 const TargetRegisterInfo *TRI,
                                 const SIMachineFunctionInfo *MFI,
                                 MCStreamer &OS) {
  // The instruction will only transfer a subset of the registers in the block,
  // based on the mask that is stored in m0. We could search for the instruction
  // that sets m0, but most of the time we'll already have the mask stored in
  // the machine function info. Try to use that. This assumes that we only use
  // block loads/stores for CSR spills.
  Register RegBlock =
      TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst
                                              : AMDGPU::OpName::vdata)
          ->getReg();
  Register FirstRegInBlock = TRI->getSubReg(RegBlock, AMDGPU::sub0);
  uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock);

  if (!Mask)
    return; // Nothing to report

  SmallString<512> TransferredRegs;
  for (unsigned I = 0; I < sizeof(Mask) * 8; ++I) {
    // Shift an unsigned one of the same width as Mask: `1 << 31` on a signed
    // int would shift into the sign bit.
    if (Mask & (uint32_t(1) << I)) {
      (llvm::Twine(" ") + TRI->getRegAsmName(FirstRegInBlock + I))
          .toVector(TransferredRegs);
    }
  }

  OS.emitRawComment(" transferring at most " + TransferredRegs);
}

void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
// FIXME: Enable feature predicate checks once all the test pass.
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
Expand Down Expand Up @@ -331,6 +362,12 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}

if (isVerbose())
if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode()))
emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(),
MF->getInfo<SIMachineFunctionInfo>(),
*OutStreamer);

MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasPointSampleAccel = false;

bool RequiresCOV6 = false;
bool UseBlockVGPROpsForCSR = false;

// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
Expand Down Expand Up @@ -1277,6 +1278,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool requiresCodeObjectV6() const { return RequiresCOV6; }

/// \returns the value of the UseBlockVGPROpsForCSR subtarget flag.
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }

bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }

bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
Expand Down
Loading
Loading