Skip to content

Commit 479d15a

Browse files
committed
[AMDGPU] Support preloading hidden kernel arguments
Adds hidden kernel arguments to the function signature and marks them inreg if they should be preloaded into user SGPRs. The normal kernarg preloading logic then takes over with some additional checks for the correct implicitarg_ptr alignment. Special care is needed so that metadata for the hidden arguments is not added twice when generating the code object.
1 parent 40e8e4d commit 479d15a

8 files changed

+1061
-7
lines changed

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,14 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
260260
auto &Func = MF.getFunction();
261261
unsigned Offset = 0;
262262
auto Args = HSAMetadataDoc->getArrayNode();
263-
for (auto &Arg : Func.args())
263+
for (auto &Arg : Func.args()) {
264+
if (Func.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
265+
Arg.getArgNo(),
266+
"amdgpu-hidden-argument"))
267+
continue;
268+
264269
emitKernelArg(Arg, Offset, Args);
270+
}
265271

266272
emitHiddenKernelArgs(MF, Offset, Args);
267273

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 197 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
#include "AMDGPU.h"
1515
#include "GCNSubtarget.h"
16+
#include "llvm/ADT/StringExtras.h"
17+
#include "llvm/Analysis/ValueTracking.h"
1618
#include "llvm/CodeGen/TargetPassConfig.h"
1719
#include "llvm/IR/IRBuilder.h"
1820
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,109 @@ class PreloadKernelArgInfo {
3133
const GCNSubtarget &ST;
3234
unsigned NumFreeUserSGPRs;
3335

34-
public:
35-
SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
36+
enum HiddenArg : unsigned {
37+
HIDDEN_BLOCK_COUNT_X,
38+
HIDDEN_BLOCK_COUNT_Y,
39+
HIDDEN_BLOCK_COUNT_Z,
40+
HIDDEN_GROUP_SIZE_X,
41+
HIDDEN_GROUP_SIZE_Y,
42+
HIDDEN_GROUP_SIZE_Z,
43+
HIDDEN_REMAINDER_X,
44+
HIDDEN_REMAINDER_Y,
45+
HIDDEN_REMAINDER_Z,
46+
END_HIDDEN_ARGS
47+
};
48+
49+
// Stores information about a specific hidden argument.
50+
struct HiddenArgInfo {
51+
// Offset in bytes from the location in the kernarg segment pointed to by
52+
// the implicitarg pointer.
53+
uint8_t Offset;
54+
// The size of the hidden argument in bytes.
55+
uint8_t Size;
56+
// The name of the hidden argument in the kernel signature.
57+
const char *Name;
58+
};
59+
60+
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
61+
{0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
62+
{8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
63+
{14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
64+
{18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
65+
{22, 2, "_hidden_remainder_z"}};
66+
67+
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
68+
for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
69+
if (HiddenArgs[I].Offset == Offset)
70+
return static_cast<HiddenArg>(I);
71+
72+
return END_HIDDEN_ARGS;
73+
}
74+
75+
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
76+
if (HA < END_HIDDEN_ARGS)
77+
return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
78+
79+
llvm_unreachable("Unexpected hidden argument.");
80+
}
81+
82+
static const char *getHiddenArgName(HiddenArg HA) {
83+
if (HA < END_HIDDEN_ARGS) {
84+
return HiddenArgs[HA].Name;
85+
}
86+
llvm_unreachable("Unexpected hidden argument.");
87+
}
88+
89+
// Clones the function after adding implicit arguments to the argument list
90+
// and returns the new updated function. Preloaded implicit arguments are
91+
// added up to and including the last one that will be preloaded, indicated by
92+
// LastPreloadIndex. Currently preloading is only performed on the totality of
93+
// sequential data from the kernarg segment including implicit (hidden)
94+
// arguments. This means that all arguments up to the last preloaded argument
95+
// will also be preloaded even if that data is unused.
96+
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
97+
FunctionType *FT = F.getFunctionType();
98+
LLVMContext &Ctx = F.getParent()->getContext();
99+
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
100+
for (unsigned I = 0; I <= LastPreloadIndex; ++I)
101+
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
102+
103+
FunctionType *NFT =
104+
FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
105+
Function *NF =
106+
Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
107+
108+
NF->copyAttributesFrom(&F);
109+
NF->copyMetadata(&F, 0);
110+
NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
111+
112+
F.getParent()->getFunctionList().insert(F.getIterator(), NF);
113+
NF->takeName(&F);
114+
NF->splice(NF->begin(), &F);
115+
116+
Function::arg_iterator NFArg = NF->arg_begin();
117+
for (Argument &Arg : F.args()) {
118+
Arg.replaceAllUsesWith(&*NFArg);
119+
NFArg->takeName(&Arg);
120+
++NFArg;
121+
}
122+
123+
AttrBuilder AB(Ctx);
124+
AB.addAttribute(Attribute::InReg);
125+
AB.addAttribute("amdgpu-hidden-argument");
126+
AttributeList AL = NF->getAttributes();
127+
for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
128+
AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
129+
NFArg++->setName(getHiddenArgName(HiddenArg(I)));
130+
}
131+
132+
NF->setAttributes(AL);
133+
F.replaceAllUsesWith(NF);
134+
135+
return NF;
136+
}
36137

138+
public:
37139
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38140
setInitialFreeUserSGPRsCount();
39141
}
@@ -64,6 +166,91 @@ class PreloadKernelArgInfo {
64166
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65167
return true;
66168
}
169+
170+
// Try to allocate SGPRs to preload implicit kernel arguments.
171+
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
172+
IRBuilder<> &Builder) {
173+
StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
174+
Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
175+
if (!ImplicitArgPtr)
176+
return;
177+
178+
const DataLayout &DL = F.getParent()->getDataLayout();
179+
// Pair is the load and the load offset.
180+
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
181+
for (auto *U : ImplicitArgPtr->users()) {
182+
Instruction *CI = dyn_cast<Instruction>(U);
183+
if (!CI || CI->getParent()->getParent() != &F)
184+
continue;
185+
186+
for (auto *U : CI->users()) {
187+
int64_t Offset = 0;
188+
auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
189+
if (!Load) {
190+
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
191+
continue;
192+
193+
Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
194+
}
195+
196+
if (!Load || !Load->isSimple())
197+
continue;
198+
199+
// FIXME: Expand to handle 64-bit implicit args and large merged loads.
200+
LLVMContext &Ctx = F.getParent()->getContext();
201+
Type *LoadTy = Load->getType();
202+
HiddenArg HA = getHiddenArgFromOffset(Offset);
203+
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
204+
continue;
205+
206+
ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
207+
}
208+
}
209+
210+
if (ImplicitArgLoads.empty())
211+
return;
212+
213+
// Allocate loads in order of offset. We need to be sure that the implicit
214+
// argument can actually be preloaded.
215+
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
216+
[](const std::pair<LoadInst *, unsigned> &A,
217+
const std::pair<LoadInst *, unsigned> &B) {
218+
return A.second < B.second;
219+
});
220+
221+
uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
222+
// If we fail to preload any implicit argument we know we don't have SGPRs
223+
// to preload any subsequent ones with larger offsets. Find the first
224+
// argument that we cannot preload.
225+
auto *PreloadEnd = std::find_if(
226+
ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
227+
[&](const std::pair<LoadInst *, unsigned> &Load) {
228+
unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
229+
unsigned LoadOffset = Load.second;
230+
if (!tryAllocPreloadSGPRs(LoadSize,
231+
LoadOffset + ImplicitArgsBaseOffset,
232+
LastExplicitArgOffset))
233+
return true;
234+
235+
LastExplicitArgOffset = LoadOffset + LoadSize;
236+
return false;
237+
});
238+
239+
if (PreloadEnd == ImplicitArgLoads.begin())
240+
return;
241+
242+
unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
243+
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
244+
assert(NF);
245+
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
246+
LoadInst *LoadInst = I->first;
247+
unsigned LoadOffset = I->second;
248+
unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
249+
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
250+
Argument *Arg = NF->getArg(Index);
251+
LoadInst->replaceAllUsesWith(Arg);
252+
}
253+
}
67254
};
68255

69256
class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +468,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
281468
KernArgSegment->addRetAttr(
282469
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
283470

471+
if (InPreloadSequence) {
472+
uint64_t ImplicitArgsBaseOffset =
473+
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
474+
BaseOffset;
475+
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
476+
Builder);
477+
}
478+
284479
return true;
285480
}
286481

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,11 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
314314
MaxAlign = Align(1);
315315

316316
for (const Argument &Arg : F.args()) {
317+
if (F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
318+
Arg.getArgNo(),
319+
"amdgpu-hidden-argument"))
320+
continue;
321+
317322
const bool IsByRef = Arg.hasByRefAttr();
318323
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
319324
Align Alignment = DL.getValueOrABITypeAlignment(

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2512,19 +2512,20 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25122512
GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
25132513
bool InPreloadSequence = true;
25142514
unsigned InIdx = 0;
2515+
bool AlignedForImplictArgs = false;
25152516
for (auto &Arg : F.args()) {
25162517
if (!InPreloadSequence || !Arg.hasInRegAttr())
25172518
break;
25182519

2519-
int ArgIdx = Arg.getArgNo();
2520+
unsigned ArgIdx = Arg.getArgNo();
25202521
// Don't preload non-original args or parts not in the current preload
25212522
// sequence.
2522-
if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2523-
(int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2523+
if (InIdx < Ins.size() &&
2524+
(!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
25242525
break;
25252526

25262527
for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2527-
(int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2528+
Ins[InIdx].getOrigArgIndex() == ArgIdx;
25282529
InIdx++) {
25292530
assert(ArgLocs[ArgIdx].isMemLoc());
25302531
auto &ArgLoc = ArgLocs[InIdx];
@@ -2534,6 +2535,23 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25342535
unsigned NumAllocSGPRs =
25352536
alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
25362537

2538+
// Add padding SGPR to fix alignment for hidden arguments.
2539+
if (!AlignedForImplictArgs &&
2540+
F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
2541+
Arg.getArgNo(),
2542+
"amdgpu-hidden-argument")) {
2543+
unsigned OffsetBefore = LastExplicitArgOffset;
2544+
LastExplicitArgOffset = alignTo(
2545+
LastExplicitArgOffset, Subtarget->getAlignmentForImplicitArgPtr());
2546+
if (OffsetBefore != LastExplicitArgOffset) {
2547+
unsigned PaddingSGPRs =
2548+
alignTo(LastExplicitArgOffset - OffsetBefore, 4) / 4;
2549+
Info.allocateUserSGPRs(PaddingSGPRs);
2550+
ArgOffset += PaddingSGPRs * 4;
2551+
}
2552+
AlignedForImplictArgs = true;
2553+
}
2554+
25372555
// Arg is preloaded into the previous SGPR.
25382556
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
25392557
Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,14 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
277277
return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
278278
}
279279

280+
bool SIMachineFunctionInfo::allocateUserSGPRs(unsigned Number) {
281+
if (Number <= getNumUserSGPRs())
282+
return false;
283+
284+
NumUserSGPRs = Number;
285+
return true;
286+
}
287+
280288
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
281289
uint64_t Size, Align Alignment) {
282290
// Skip if it is an entry function or the register is already added.

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
769769
unsigned AllocSizeDWord, int KernArgIdx,
770770
int PaddingSGPRs);
771771

772+
/// Reserve up to \p Number of user SGPRs.
773+
bool allocateUserSGPRs(unsigned Number);
774+
772775
/// Increment user SGPRs used for padding the argument list only.
773776
Register addReservedUserSGPR() {
774777
Register Next = getNextUserSGPR();

0 commit comments

Comments
 (0)