Skip to content

Commit b21e586

Browse files
committed
Reapply "Reland [AMDGPU] Support block load/store for CSR llvm#130013 (llvm#137169)"
This reverts commit c076481.
1 parent d598872 commit b21e586

20 files changed

+1110
-56
lines changed

llvm/include/llvm/CodeGen/MachineFrameInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class CalleeSavedInfo {
6161
MCRegister getReg() const { return Reg; }
6262
int getFrameIdx() const { return FrameIdx; }
6363
MCRegister getDstReg() const { return DstReg; }
64+
/// Overwrite the register recorded in this entry with \p R.
void setReg(MCRegister R) { Reg = R; }
6465
void setFrameIdx(int FI) {
6566
FrameIdx = FI;
6667
SpilledToReg = false;

llvm/include/llvm/CodeGen/TargetFrameLowering.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,14 @@ class TargetFrameLowering {
271271
return false;
272272
}
273273

274+
/// spillCalleeSavedRegister - Default implementation for spilling a single
275+
/// callee saved register.
276+
void spillCalleeSavedRegister(MachineBasicBlock &SaveBlock,
277+
MachineBasicBlock::iterator MI,
278+
const CalleeSavedInfo &CS,
279+
const TargetInstrInfo *TII,
280+
const TargetRegisterInfo *TRI) const;
281+
274282
/// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee
275283
/// saved registers and returns true if it isn't possible / profitable to do
276284
/// so by issuing a series of load instructions via loadRegToStackSlot().
@@ -285,6 +293,15 @@ class TargetFrameLowering {
285293
return false;
286294
}
287295

296+
/// restoreCalleeSavedRegister - Default implementation for restoring a single
297+
/// callee saved register. Should be called in reverse order. Can insert
298+
/// multiple instructions.
299+
void restoreCalleeSavedRegister(MachineBasicBlock &MBB,
300+
MachineBasicBlock::iterator MI,
301+
const CalleeSavedInfo &CS,
302+
const TargetInstrInfo *TII,
303+
const TargetRegisterInfo *TRI) const;
304+
288305
/// hasFP - Return true if the specified function should have a dedicated
289306
/// frame pointer register. For most targets this is true only if the function
290307
/// has variable sized allocas or if frame pointer elimination is disabled.

llvm/lib/CodeGen/PrologEpilogInserter.cpp

Lines changed: 6 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -476,8 +476,8 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
476476
// Now that we know which registers need to be saved and restored, allocate
477477
// stack slots for them.
478478
for (auto &CS : CSI) {
479-
// If the target has spilled this register to another register, we don't
480-
// need to allocate a stack slot.
479+
// If the target has spilled this register to another register or already
480+
// handled it, we don't need to allocate a stack slot.
481481
if (CS.isSpilledToReg())
482482
continue;
483483

@@ -597,25 +597,14 @@ static void updateLiveness(MachineFunction &MF) {
597597
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
598598
ArrayRef<CalleeSavedInfo> CSI) {
599599
MachineFunction &MF = *SaveBlock.getParent();
600-
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
600+
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
601601
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
602602
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
603603

604604
MachineBasicBlock::iterator I = SaveBlock.begin();
605605
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
606606
for (const CalleeSavedInfo &CS : CSI) {
607-
// Insert the spill to the stack frame.
608-
MCRegister Reg = CS.getReg();
609-
610-
if (CS.isSpilledToReg()) {
611-
BuildMI(SaveBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY),
612-
CS.getDstReg())
613-
.addReg(Reg, getKillRegState(true));
614-
} else {
615-
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
616-
TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
617-
TRI, Register());
618-
}
607+
TFI->spillCalleeSavedRegister(SaveBlock, I, CS, TII, TRI);
619608
}
620609
}
621610
}
@@ -624,7 +613,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
624613
static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
625614
std::vector<CalleeSavedInfo> &CSI) {
626615
MachineFunction &MF = *RestoreBlock.getParent();
627-
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
616+
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
628617
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
629618
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
630619

@@ -634,19 +623,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
634623

635624
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
636625
for (const CalleeSavedInfo &CI : reverse(CSI)) {
637-
MCRegister Reg = CI.getReg();
638-
if (CI.isSpilledToReg()) {
639-
BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)
640-
.addReg(CI.getDstReg(), getKillRegState(true));
641-
} else {
642-
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
643-
TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC,
644-
TRI, Register());
645-
assert(I != RestoreBlock.begin() &&
646-
"loadRegFromStackSlot didn't insert any code!");
647-
// Insert in reverse order. loadRegFromStackSlot can insert
648-
// multiple instructions.
649-
}
626+
TFI->restoreCalleeSavedRegister(RestoreBlock, I, CI, TII, TRI);
650627
}
651628
}
652629
}

llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "llvm/CodeGen/MachineModuleInfo.h"
1717
#include "llvm/CodeGen/MachineRegisterInfo.h"
1818
#include "llvm/CodeGen/TargetFrameLowering.h"
19+
#include "llvm/CodeGen/TargetInstrInfo.h"
1920
#include "llvm/CodeGen/TargetSubtargetInfo.h"
2021
#include "llvm/IR/Attributes.h"
2122
#include "llvm/IR/CallingConv.h"
@@ -211,3 +212,37 @@ TargetFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
211212
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
212213
return DwarfFrameBase{DwarfFrameBase::Register, {RI->getFrameRegister(MF).id()}};
213214
}
215+
216+
/// Default implementation for spilling a single callee-saved register.
///
/// Inserts the save code before \p MI in \p SaveBlock. If \p CS is marked as
/// spilled to a register, a COPY into the destination register recorded in
/// \p CS is emitted; otherwise the register is stored to the frame index
/// recorded in \p CS via TargetInstrInfo::storeRegToStackSlot().
void TargetFrameLowering::spillCalleeSavedRegister(
    MachineBasicBlock &SaveBlock, MachineBasicBlock::iterator MI,
    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI) const {
  const MCRegister SavedReg = CS.getReg();

  // Register-to-register save: a COPY that marks the source as killed.
  if (CS.isSpilledToReg()) {
    BuildMI(SaveBlock, MI, DebugLoc(), TII->get(TargetOpcode::COPY),
            CS.getDstReg())
        .addReg(SavedReg, getKillRegState(true));
    return;
  }

  // Insert the spill to the stack frame, using the minimal physical register
  // class that contains the register.
  const TargetRegisterClass *SpillRC = TRI->getMinimalPhysRegClass(SavedReg);
  TII->storeRegToStackSlot(SaveBlock, MI, SavedReg, true, CS.getFrameIdx(),
                           SpillRC, TRI, Register());
}
233+
234+
/// Default implementation for restoring a single callee-saved register.
///
/// Inserts the restore code before \p MI in \p MBB. If \p CS is marked as
/// spilled to a register, a COPY back from the destination register recorded
/// in \p CS is emitted; otherwise the register is reloaded from the frame
/// index recorded in \p CS via TargetInstrInfo::loadRegFromStackSlot().
void TargetFrameLowering::restoreCalleeSavedRegister(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI) const {
  const MCRegister RestoredReg = CS.getReg();

  // Stack-slot case: reload using the minimal physical register class that
  // contains the register.
  if (!CS.isSpilledToReg()) {
    const TargetRegisterClass *ReloadRC =
        TRI->getMinimalPhysRegClass(RestoredReg);
    TII->loadRegFromStackSlot(MBB, MI, RestoredReg, CS.getFrameIdx(), ReloadRC,
                              TRI, Register());
    // The reload is inserted before MI, so MI can no longer be the first
    // instruction of the block if anything was emitted.
    assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
    return;
  }

  // Register-to-register case: copy the value back and kill the temporary.
  BuildMI(MBB, MI, DebugLoc(), TII->get(TargetOpcode::COPY), RestoredReg)
      .addReg(CS.getDstReg(), getKillRegState(true));
}

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,6 +1275,14 @@ def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32
12751275
"Use a block size of 32 for dynamic VGPR allocation (default is 16)"
12761276
>;
12771277

1278+
// Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and
1279+
// restoring the callee-saved registers.
1280+
def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr",
1281+
"UseBlockVGPROpsForCSR",
1282+
"true",
1283+
"Use block load/store for VGPR callee saved registers"
1284+
>;
1285+
12781286
def FeatureLshlAddU64Inst
12791287
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
12801288
"Has v_lshl_add_u64 instruction">;

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "MCTargetDesc/AMDGPUInstPrinter.h"
2020
#include "MCTargetDesc/AMDGPUMCExpr.h"
2121
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22+
#include "SIMachineFunctionInfo.h"
2223
#include "llvm/CodeGen/MachineBasicBlock.h"
2324
#include "llvm/CodeGen/MachineInstr.h"
2425
#include "llvm/IR/Constants.h"
@@ -243,6 +244,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV,
243244
return AsmPrinter::lowerConstant(CV, BaseCV, Offset);
244245
}
245246

247+
static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
248+
const TargetRegisterInfo *TRI,
249+
const SIMachineFunctionInfo *MFI,
250+
MCStreamer &OS) {
251+
// The instruction will only transfer a subset of the registers in the block,
252+
// based on the mask that is stored in m0. We could search for the instruction
253+
// that sets m0, but most of the time we'll already have the mask stored in
254+
// the machine function info. Try to use that. This assumes that we only use
255+
// block loads/stores for CSR spills.
256+
Register RegBlock =
257+
TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst
258+
: AMDGPU::OpName::vdata)
259+
->getReg();
260+
Register FirstRegInBlock = TRI->getSubReg(RegBlock, AMDGPU::sub0);
261+
uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock);
262+
263+
if (!Mask)
264+
return; // Nothing to report
265+
266+
SmallString<512> TransferredRegs;
267+
for (unsigned I = 0; I < sizeof(Mask) * 8; ++I) {
268+
if (Mask & (1 << I)) {
269+
(llvm::Twine(" ") + TRI->getRegAsmName(FirstRegInBlock + I))
270+
.toVector(TransferredRegs);
271+
}
272+
}
273+
274+
OS.emitRawComment(" transferring at most " + TransferredRegs);
275+
}
276+
246277
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
247278
switch (MI->getOpcode()) {
248279
case TargetOpcode::DBG_VALUE:
@@ -338,6 +369,12 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
338369
return;
339370
}
340371

372+
if (isVerbose())
373+
if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode()))
374+
emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(),
375+
MF->getInfo<SIMachineFunctionInfo>(),
376+
*OutStreamer);
377+
341378
MCInst TmpInst;
342379
MCInstLowering.lower(MI, TmpInst);
343380
EmitToStreamer(*OutStreamer, TmpInst);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
262262
bool HasPointSampleAccel = false;
263263

264264
bool RequiresCOV6 = false;
265+
bool UseBlockVGPROpsForCSR = false;
265266

266267
// Dummy feature to use for assembler in tablegen.
267268
bool FeatureDisable = false;
@@ -1279,6 +1280,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12791280

12801281
bool requiresCodeObjectV6() const { return RequiresCOV6; }
12811282

1283+
/// \returns true if the "block-vgpr-csr" subtarget feature is enabled, i.e.
/// block load/store should be used for VGPR callee saved registers.
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
1284+
12821285
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
12831286

12841287
bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }

0 commit comments

Comments
 (0)