
Commit 40889af

[AMDGPU] Support preloading hidden kernel arguments
Adds hidden kernel arguments to the function signature and marks them inreg if they should be preloaded into user SGPRs. The normal kernarg preloading logic then takes over with some additional checks for the correct implicitarg_ptr alignment. Special care is needed so that metadata for the hidden arguments is not added twice when generating the code object.
1 parent 8e2476e

9 files changed: +2655 −1381 lines
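
For illustration, here is a minimal before/after IR sketch of the transformation (hypothetical kernel @kern with a single explicit argument; the hidden-argument names, i32/i16 types, and the "amdgpu-hidden-arg-offset" attribute follow the tables added in AMDGPULowerKernelArguments.cpp below, not output copied from the commit's tests):

; Before the pass: the block count is read through llvm.amdgcn.implicitarg.ptr.
declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()

define amdgpu_kernel void @kern(ptr addrspace(1) inreg %out) {
  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %block.count.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
  store i32 %block.count.x, ptr addrspace(1) %out
  ret void
}

; After the pass: the kernel is cloned with all nine hidden arguments appended;
; the ones needed for preloading (here only _hidden_block_count_x) are marked
; inreg, and the offset to the first hidden argument is recorded in a function
; attribute.
define amdgpu_kernel void @kern(ptr addrspace(1) inreg %out,
                                i32 inreg %_hidden_block_count_x,
                                i32 %_hidden_block_count_y,
                                i32 %_hidden_block_count_z,
                                i16 %_hidden_group_size_x,
                                i16 %_hidden_group_size_y,
                                i16 %_hidden_group_size_z,
                                i16 %_hidden_remainder_x,
                                i16 %_hidden_remainder_y,
                                i16 %_hidden_remainder_z) #0 {
  store i32 %_hidden_block_count_x, ptr addrspace(1) %out
  ret void
}

attributes #0 = { "amdgpu-hidden-arg-offset"="1" }

From here the existing explicit-kernarg preloading handles the inreg hidden argument, and the metadata streamer stops emitting explicit-argument records at the recorded offset so the hidden arguments are only described once.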

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 5 additions & 2 deletions
@@ -258,10 +258,13 @@ void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,
 void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
                                                msgpack::MapDocNode Kern) {
   auto &Func = MF.getFunction();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  unsigned HiddenArgOffset = ST.getHiddenArgOffset(MF.getFunction());
   unsigned Offset = 0;
   auto Args = HSAMetadataDoc->getArrayNode();
-  for (auto &Arg : Func.args())
-    emitKernelArg(Arg, Offset, Args);
+  for (unsigned I = 0; I < Func.arg_size() && I < HiddenArgOffset; ++I) {
+    emitKernelArg(*Func.getArg(I), Offset, Args);
+  }

   emitHiddenKernelArgs(MF, Offset, Args);

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 184 additions & 2 deletions
@@ -13,6 +13,8 @@

 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,93 @@ class PreloadKernelArgInfo {
   const GCNSubtarget &ST;
   unsigned NumFreeUserSGPRs;

-public:
-  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
+  enum HiddenArg : unsigned {
+    HIDDEN_BLOCK_COUNT_X,
+    HIDDEN_BLOCK_COUNT_Y,
+    HIDDEN_BLOCK_COUNT_Z,
+    HIDDEN_GROUP_SIZE_X,
+    HIDDEN_GROUP_SIZE_Y,
+    HIDDEN_GROUP_SIZE_Z,
+    HIDDEN_REMAINDER_X,
+    HIDDEN_REMAINDER_Y,
+    HIDDEN_REMAINDER_Z,
+    END_HIDDEN_ARGS
+  };
+
+  struct HiddenArgInfo {
+    unsigned Offset;
+    unsigned Size;
+    const char *Name;
+  };
+
+  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
+      {22, 2, "_hidden_remainder_z"}};
+
+  static HiddenArg getHiddenArgIndexFromOffset(unsigned Offset) {
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      if (HiddenArgs[I].Offset == Offset)
+        return static_cast<HiddenArg>(I);
+
+    llvm_unreachable("Unexpected hidden argument offset.");
+  }
+
+  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS)
+      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+
+    llvm_unreachable("Unexpected hidden argument.");
+  }

+  static const char *getHiddenArgName(HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS) {
+      return HiddenArgs[HA].Name;
+    }
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  Function *cloneFunctionWithPreloadImplicitArgs() {
+    FunctionType *FT = F.getFunctionType();
+    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      FTypes.push_back(getHiddenArgType(F.getContext(), HiddenArg(I)));
+
+    FunctionType *NFT =
+        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+    Function *NF =
+        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+
+    NF->copyAttributesFrom(&F);
+    NF->copyMetadata(&F, 0);
+    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
+    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+    NF->takeName(&F);
+    NF->splice(NF->begin(), &F);
+
+    Function::arg_iterator NFArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(&*NFArg);
+      NFArg->takeName(&Arg);
+      ++NFArg;
+    }
+
+    // Add an attribute that tracks the index offset to the first hidden
+    // argument.
+    NF->addFnAttr("amdgpu-hidden-arg-offset", utostr(NFArg->getArgNo()));
+
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
+
+    F.replaceAllUsesWith(NF);
+
+    return NF;
+  }
+
+public:
   PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
     setInitialFreeUserSGPRsCount();
   }
@@ -64,6 +150,94 @@ class PreloadKernelArgInfo {
     NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
     return true;
   }
+
+  // Try to allocate SGPRs to preload implicit kernel arguments.
+  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+                                       IRBuilder<> &Builder) {
+    StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
+    Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      Instruction *CI = dyn_cast<Instruction>(U);
+      if (!CI || CI->getParent()->getParent() != &F)
+        continue;
+
+      for (auto *U : CI->users()) {
+        int64_t Offset = 0;
+        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+        if (!Load) {
+          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+            continue;
+
+          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+        }
+
+        if (!Load || !Load->isSimple())
+          continue;
+
+        // FIXME: Expand to handle 64-bit implicit args and large merged loads.
+        unsigned LoadSize = Load->getType()->getScalarSizeInBits();
+        if (LoadSize != 32 && LoadSize != 16)
+          continue;
+
+        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+      }
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+              [](const std::pair<LoadInst *, unsigned> &A,
+                 const std::pair<LoadInst *, unsigned> &B) {
+                return A.second < B.second;
+              });
+
+    uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
+    bool AddedHiddenArgsToSignature = false;
+    Function *NF = nullptr;
+    unsigned LastPreloadIndex = 0;
+    for (const auto &Load : ImplicitArgLoads) {
+      LoadInst *LoadInst = Load.first;
+      Type *LoadType = LoadInst->getType();
+      auto LoadOffset = Load.second;
+      unsigned LoadSize = DL.getTypeStoreSize(LoadType);
+      // If we fail to preload any implicit argument we know we don't have SGPRs
+      // to preload any subsequent ones with larger offsets.
+      if (!tryAllocPreloadSGPRs(LoadSize, LoadOffset + ImplicitArgsBaseOffset,
+                                LastExplicitArgOffset))
+        break;
+
+      if (!AddedHiddenArgsToSignature) {
+        NF = cloneFunctionWithPreloadImplicitArgs();
+        AddedHiddenArgsToSignature = true;
+      }
+
+      LastExplicitArgOffset = LoadOffset + LoadSize;
+      unsigned HiddenArgIndex = getHiddenArgIndexFromOffset(LoadOffset);
+      assert(NF);
+      unsigned Index = NF->arg_size() - END_HIDDEN_ARGS + HiddenArgIndex;
+      Argument *Arg = NF->getArg(Index);
+      LoadInst->replaceAllUsesWith(Arg);
+      if (Index > HiddenArgIndex)
+        LastPreloadIndex = HiddenArgIndex;
+    }
+
+    // Ensure all hidden arguments up to the final preload are also
+    // preloaded, even if some are unused.
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+      NF->getArg(NF->arg_size() - END_HIDDEN_ARGS + I)
+          ->addAttr(Attribute::InReg);
+
+    F.removeFromParent();
+  }
 };

 class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +455,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   KernArgSegment->addRetAttr(
       Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

+  if (InPreloadSequence) {
+    uint64_t ImplicitArgsBaseOffset =
+        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+        BaseOffset;
+    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                Builder);
+  }
+
   return true;
 }

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 12 additions & 0 deletions
@@ -610,7 +610,11 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
   uint64_t ExplicitArgBytes = 0;
   MaxAlign = Align(1);

+  unsigned HiddenArgOffset = getHiddenArgOffset(F);
   for (const Argument &Arg : F.args()) {
+    if (Arg.getArgNo() >= HiddenArgOffset)
+      break;
+
     const bool IsByRef = Arg.hasByRefAttr();
     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
     Align Alignment = DL.getValueOrABITypeAlignment(
@@ -1035,6 +1039,14 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
   return 3;
 }

+unsigned AMDGPUSubtarget::getHiddenArgOffset(const Function &F) const {
+  if (!F.hasFnAttribute("amdgpu-hidden-arg-offset"))
+    return F.arg_size();
+
+  return F.getFnAttributeAsParsedInteger("amdgpu-hidden-arg-offset",
+                                         F.arg_size());
+}
+
 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 4 additions & 0 deletions
@@ -303,6 +303,10 @@ class AMDGPUSubtarget {
   uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
   unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;

+  /// \returns an offset index to the first hidden implicit argument in the
+  /// kernel signature.
+  unsigned getHiddenArgOffset(const Function &F) const;
+
   /// \returns Corresponding DWARF register number mapping flavour for the
   /// \p WavefrontSize.
   AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 19 additions & 4 deletions
@@ -2511,19 +2511,20 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
   bool InPreloadSequence = true;
   unsigned InIdx = 0;
+  bool AlignedForImplictArgs = false;
   for (auto &Arg : F.args()) {
     if (!InPreloadSequence || !Arg.hasInRegAttr())
       break;

-    int ArgIdx = Arg.getArgNo();
+    unsigned ArgIdx = Arg.getArgNo();
     // Don't preload non-original args or parts not in the current preload
     // sequence.
-    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
-                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
+    if (InIdx < Ins.size() &&
+        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
       break;

     for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
-           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
+           Ins[InIdx].getOrigArgIndex() == ArgIdx;
          InIdx++) {
       assert(ArgLocs[ArgIdx].isMemLoc());
       auto &ArgLoc = ArgLocs[InIdx];
@@ -2533,6 +2534,20 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
       unsigned NumAllocSGPRs =
           alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

+      if (!AlignedForImplictArgs &&
+          ArgIdx == Subtarget->getHiddenArgOffset(F)) {
+        unsigned OffsetBefore = LastExplicitArgOffset;
+        LastExplicitArgOffset = alignTo(
+            LastExplicitArgOffset, Subtarget->getAlignmentForImplicitArgPtr());
+        if (OffsetBefore != LastExplicitArgOffset) {
+          unsigned PaddingSGPRs =
+              alignTo(LastExplicitArgOffset - OffsetBefore, 4) / 4;
+          Info.allocateUserSGPRs(PaddingSGPRs);
+          ArgOffset += PaddingSGPRs * 4;
+        }
+        AlignedForImplictArgs = true;
+      }
+
       // Arg is preloaded into the previous SGPR.
       if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
         Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
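
As a worked example of the padding logic above (hypothetical numbers, not taken from the commit): if the last explicit argument ends at byte offset 10 and getAlignmentForImplicitArgPtr() is 8, LastExplicitArgOffset is rounded up to 16; PaddingSGPRs = alignTo(16 - 10, 4) / 4 = 2, so two user SGPRs are reserved through allocateUserSGPRs and ArgOffset advances by 8 bytes, putting the first hidden argument on the required implicitarg_ptr boundary.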

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 9 additions & 0 deletions
@@ -278,6 +278,15 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
 }

+bool SIMachineFunctionInfo::allocateUserSGPRs(
+    unsigned Number) {
+  if (Number <= getNumUserSGPRs())
+    return false;
+
+  NumUserSGPRs = Number;
+  return true;
+}
+
 void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                              uint64_t Size, Align Alignment) {
   // Skip if it is an entry function or the register is already added.

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 3 additions & 0 deletions
@@ -760,6 +760,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
                       unsigned AllocSizeDWord, int KernArgIdx,
                       int PaddingSGPRs);

+  /// Reserve up to \p Number of user SGPRs.
+  bool allocateUserSGPRs(unsigned Number);
+
   /// Increment user SGPRs used for padding the argument list only.
   Register addReservedUserSGPR() {
     Register Next = getNextUserSGPR();
