Skip to content

Commit 5bad5d8

Browse files
authored
Reland [AMDGPU] Support block load/store for CSR llvm#130013 (llvm#137169)
Add support for using the existing SCRATCH_STORE_BLOCK and SCRATCH_LOAD_BLOCK instructions for saving and restoring callee-saved VGPRs. This is controlled by a new subtarget feature, block-vgpr-csr. It does not include WWM registers - those will be saved and restored individually, just like before. This patch does not change the ABI. Use of this feature may lead to slightly increased stack usage, because the memory is not compacted if certain registers don't have to be transferred (this will happen in practice for calling conventions where the callee and caller saved registers are interleaved in groups of 8). However, if the registers at the end of the block of 32 don't have to be transferred, we don't need to use a whole 128-byte stack slot - we can trim some space off the end of the range. In order to implement this feature, we need to rely less on the target-independent code in the PrologEpilogInserter, so we override several new methods in SIFrameLowering. We also add new pseudos, SI_BLOCK_SPILL_V1024_SAVE/RESTORE. One peculiarity is that both the SI_BLOCK_SPILL_V1024_RESTORE pseudo and the SCRATCH_LOAD_BLOCK instruction will have all the registers that are not transferred added as implicit uses. This is done in order to inform LiveRegUnits that those registers are not available before the restore (since we're not really restoring them - so we can't afford to scavenge them). Unfortunately, this trick doesn't work with the save, so before the save all the registers in the block will be unavailable (see the unit test). This was reverted due to failures in the builds with expensive checks enabled; this is now fixed by always updating LiveIntervals and SlotIndexes in SILowerSGPRSpills.
1 parent 211b51e commit 5bad5d8

20 files changed

+1088
-50
lines changed

llvm/include/llvm/CodeGen/MachineFrameInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class CalleeSavedInfo {
6161
MCRegister getReg() const { return Reg; }
6262
int getFrameIdx() const { return FrameIdx; }
6363
MCRegister getDstReg() const { return DstReg; }
64+
void setReg(MCRegister R) { Reg = R; }
6465
void setFrameIdx(int FI) {
6566
FrameIdx = FI;
6667
SpilledToReg = false;

llvm/include/llvm/CodeGen/TargetFrameLowering.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,14 @@ class TargetFrameLowering {
270270
return false;
271271
}
272272

273+
/// spillCalleeSavedRegister - Default implementation for spilling a single
274+
/// callee saved register.
275+
void spillCalleeSavedRegister(MachineBasicBlock &SaveBlock,
276+
MachineBasicBlock::iterator MI,
277+
const CalleeSavedInfo &CS,
278+
const TargetInstrInfo *TII,
279+
const TargetRegisterInfo *TRI) const;
280+
273281
/// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee
274282
/// saved registers and returns true if it isn't possible / profitable to do
275283
/// so by issuing a series of load instructions via loadRegToStackSlot().
@@ -284,6 +292,15 @@ class TargetFrameLowering {
284292
return false;
285293
}
286294

295+
/// restoreCalleeSavedRegister - Default implementation for restoring a single
296+
/// callee saved register. Should be called in reverse order. Can insert
297+
/// multiple instructions.
298+
void restoreCalleeSavedRegister(MachineBasicBlock &MBB,
299+
MachineBasicBlock::iterator MI,
300+
const CalleeSavedInfo &CS,
301+
const TargetInstrInfo *TII,
302+
const TargetRegisterInfo *TRI) const;
303+
287304
/// hasFP - Return true if the specified function should have a dedicated
288305
/// frame pointer register. For most targets this is true only if the function
289306
/// has variable sized allocas or if frame pointer elimination is disabled.

llvm/lib/CodeGen/PrologEpilogInserter.cpp

Lines changed: 6 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -476,8 +476,8 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
476476
// Now that we know which registers need to be saved and restored, allocate
477477
// stack slots for them.
478478
for (auto &CS : CSI) {
479-
// If the target has spilled this register to another register, we don't
480-
// need to allocate a stack slot.
479+
// If the target has spilled this register to another register or already
480+
// handled it, we don't need to allocate a stack slot.
481481
if (CS.isSpilledToReg())
482482
continue;
483483

@@ -597,25 +597,14 @@ static void updateLiveness(MachineFunction &MF) {
597597
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
598598
ArrayRef<CalleeSavedInfo> CSI) {
599599
MachineFunction &MF = *SaveBlock.getParent();
600-
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
600+
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
601601
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
602602
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
603603

604604
MachineBasicBlock::iterator I = SaveBlock.begin();
605605
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
606606
for (const CalleeSavedInfo &CS : CSI) {
607-
// Insert the spill to the stack frame.
608-
MCRegister Reg = CS.getReg();
609-
610-
if (CS.isSpilledToReg()) {
611-
BuildMI(SaveBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY),
612-
CS.getDstReg())
613-
.addReg(Reg, getKillRegState(true));
614-
} else {
615-
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
616-
TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
617-
TRI, Register());
618-
}
607+
TFI->spillCalleeSavedRegister(SaveBlock, I, CS, TII, TRI);
619608
}
620609
}
621610
}
@@ -624,7 +613,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
624613
static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
625614
std::vector<CalleeSavedInfo> &CSI) {
626615
MachineFunction &MF = *RestoreBlock.getParent();
627-
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
616+
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
628617
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
629618
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
630619

@@ -634,19 +623,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
634623

635624
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
636625
for (const CalleeSavedInfo &CI : reverse(CSI)) {
637-
MCRegister Reg = CI.getReg();
638-
if (CI.isSpilledToReg()) {
639-
BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)
640-
.addReg(CI.getDstReg(), getKillRegState(true));
641-
} else {
642-
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
643-
TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC,
644-
TRI, Register());
645-
assert(I != RestoreBlock.begin() &&
646-
"loadRegFromStackSlot didn't insert any code!");
647-
// Insert in reverse order. loadRegFromStackSlot can insert
648-
// multiple instructions.
649-
}
626+
TFI->restoreCalleeSavedRegister(RestoreBlock, I, CI, TII, TRI);
650627
}
651628
}
652629
}

llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "llvm/CodeGen/MachineFunction.h"
1616
#include "llvm/CodeGen/MachineRegisterInfo.h"
1717
#include "llvm/CodeGen/TargetFrameLowering.h"
18+
#include "llvm/CodeGen/TargetInstrInfo.h"
1819
#include "llvm/CodeGen/TargetSubtargetInfo.h"
1920
#include "llvm/IR/Attributes.h"
2021
#include "llvm/IR/Function.h"
@@ -182,3 +183,37 @@ TargetFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
182183
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
183184
return DwarfFrameBase{DwarfFrameBase::Register, {RI->getFrameRegister(MF).id()}};
184185
}
186+
187+
/// Default save of a single callee-saved register described by \p CS.
///
/// The save is inserted into \p SaveBlock before \p MI. If \p CS is marked as
/// spilled to a register, a COPY into its destination register is emitted;
/// otherwise the register is stored to the frame index recorded in \p CS via
/// TargetInstrInfo::storeRegToStackSlot.
void TargetFrameLowering::spillCalleeSavedRegister(
    MachineBasicBlock &SaveBlock, MachineBasicBlock::iterator MI,
    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI) const {
  const MCRegister SavedReg = CS.getReg();

  // Stack-slot case: store the register to its assigned frame index.
  if (!CS.isSpilledToReg()) {
    const TargetRegisterClass *RegClass =
        TRI->getMinimalPhysRegClass(SavedReg);
    TII->storeRegToStackSlot(SaveBlock, MI, SavedReg, true, CS.getFrameIdx(),
                             RegClass, TRI, Register());
    return;
  }

  // Register case: copy the callee-saved register into its destination
  // register, marking the source as killed.
  BuildMI(SaveBlock, MI, DebugLoc(), TII->get(TargetOpcode::COPY),
          CS.getDstReg())
      .addReg(SavedReg, getKillRegState(true));
}
204+
205+
/// Default restore of a single callee-saved register described by \p CS.
///
/// The restore is inserted into \p MBB before \p MI. If \p CS is marked as
/// spilled to a register, a COPY from that register back into the
/// callee-saved register is emitted; otherwise the register is reloaded from
/// the frame index recorded in \p CS via
/// TargetInstrInfo::loadRegFromStackSlot.
void TargetFrameLowering::restoreCalleeSavedRegister(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI) const {
  const MCRegister RestoredReg = CS.getReg();

  // Stack-slot case: reload the register from its assigned frame index.
  if (!CS.isSpilledToReg()) {
    const TargetRegisterClass *RegClass =
        TRI->getMinimalPhysRegClass(RestoredReg);
    TII->loadRegFromStackSlot(MBB, MI, RestoredReg, CS.getFrameIdx(), RegClass,
                              TRI, Register());
    // The reload must have placed at least one instruction before MI.
    assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
    return;
  }

  // Register case: copy the saved value back, marking the holding register
  // as killed.
  BuildMI(MBB, MI, DebugLoc(), TII->get(TargetOpcode::COPY), RestoredReg)
      .addReg(CS.getDstReg(), getKillRegState(true));
}

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,6 +1275,14 @@ def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32
12751275
"Use a block size of 32 for dynamic VGPR allocation (default is 16)"
12761276
>;
12771277

1278+
// Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and
1279+
// restoring the callee-saved registers.
1280+
def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr",
1281+
"UseBlockVGPROpsForCSR",
1282+
"true",
1283+
"Use block load/store for VGPR callee saved registers"
1284+
>;
1285+
12781286
def FeatureLshlAddU64Inst
12791287
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
12801288
"Has v_lshl_add_u64 instruction">;

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "MCTargetDesc/AMDGPUInstPrinter.h"
2020
#include "MCTargetDesc/AMDGPUMCExpr.h"
2121
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22+
#include "SIMachineFunctionInfo.h"
2223
#include "llvm/CodeGen/MachineBasicBlock.h"
2324
#include "llvm/CodeGen/MachineInstr.h"
2425
#include "llvm/IR/Constants.h"
@@ -243,6 +244,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV,
243244
return AsmPrinter::lowerConstant(CV, BaseCV, Offset);
244245
}
245246

247+
static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
248+
const TargetRegisterInfo *TRI,
249+
const SIMachineFunctionInfo *MFI,
250+
MCStreamer &OS) {
251+
// The instruction will only transfer a subset of the registers in the block,
252+
// based on the mask that is stored in m0. We could search for the instruction
253+
// that sets m0, but most of the time we'll already have the mask stored in
254+
// the machine function info. Try to use that. This assumes that we only use
255+
// block loads/stores for CSR spills.
256+
Register RegBlock =
257+
TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst
258+
: AMDGPU::OpName::vdata)
259+
->getReg();
260+
Register FirstRegInBlock = TRI->getSubReg(RegBlock, AMDGPU::sub0);
261+
uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock);
262+
263+
if (!Mask)
264+
return; // Nothing to report
265+
266+
SmallString<512> TransferredRegs;
267+
for (unsigned I = 0; I < sizeof(Mask) * 8; ++I) {
268+
if (Mask & (1 << I)) {
269+
(llvm::Twine(" ") + TRI->getRegAsmName(FirstRegInBlock + I))
270+
.toVector(TransferredRegs);
271+
}
272+
}
273+
274+
OS.emitRawComment(" transferring at most " + TransferredRegs);
275+
}
276+
246277
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
247278
// FIXME: Enable feature predicate checks once all the test pass.
248279
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
@@ -331,6 +362,12 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
331362
return;
332363
}
333364

365+
if (isVerbose())
366+
if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode()))
367+
emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(),
368+
MF->getInfo<SIMachineFunctionInfo>(),
369+
*OutStreamer);
370+
334371
MCInst TmpInst;
335372
MCInstLowering.lower(MI, TmpInst);
336373
EmitToStreamer(*OutStreamer, TmpInst);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
262262
bool HasPointSampleAccel = false;
263263

264264
bool RequiresCOV6 = false;
265+
bool UseBlockVGPROpsForCSR = false;
265266

266267
// Dummy feature to use for assembler in tablegen.
267268
bool FeatureDisable = false;
@@ -1277,6 +1278,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12771278

12781279
bool requiresCodeObjectV6() const { return RequiresCOV6; }
12791280

1281+
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
1282+
12801283
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
12811284

12821285
bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }

0 commit comments

Comments
 (0)