
Commit 550a6f3

[AMDGPU] Support preloading hidden kernel arguments
Adds hidden kernel arguments to the function signature and marks them inreg if they should be preloaded into user SGPRs. The normal kernarg preloading logic then takes over with some additional checks for the correct implicitarg_ptr alignment. Special care is needed so that metadata for the hidden arguments is not added twice when generating the code object.
1 parent 587308c commit 550a6f3

8 files changed: 944 additions & 41 deletions
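
For orientation, the hidden arguments this change can preload are the first nine entries of the implicitarg area (block counts, group sizes, remainders), which together occupy its first 24 bytes. The standalone sketch below is illustrative only, not part of the patch: it mirrors the offset/size table added in AMDGPULowerKernelArguments.cpp and shows the offset-to-argument lookup the pass relies on when it rewrites a load from the implicit argument pointer; findHiddenArg is a hypothetical helper, not an LLVM API.

#include <cstdio>

// Byte layout of one hidden argument within the implicitarg area.
struct HiddenArgInfo {
  unsigned Offset;
  unsigned Size;
  const char *Name;
};

// Mirrors the HiddenArgs table from the patch: the first 24 bytes of the
// implicitarg area hold the block counts, group sizes, and remainders.
static const HiddenArgInfo HiddenArgs[] = {
    {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
    {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
    {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
    {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
    {22, 2, "_hidden_remainder_z"}};

// Map a load offset from the implicitarg pointer to the hidden argument it
// reads; returns nullptr if the offset does not start a known hidden argument.
static const HiddenArgInfo *findHiddenArg(unsigned Offset) {
  for (const HiddenArgInfo &HA : HiddenArgs)
    if (HA.Offset == Offset)
      return &HA;
  return nullptr;
}

int main() {
  // A 2-byte load at implicitarg offset 12 reads _hidden_group_size_x, so the
  // pass can rewrite it to use a preloaded SGPR parameter of that name.
  if (const HiddenArgInfo *HA = findHiddenArg(12))
    std::printf("%s: %u bytes at offset %u\n", HA->Name, HA->Size, HA->Offset);
  return 0;
}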

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 3 additions & 0 deletions

@@ -283,6 +283,9 @@ void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
   else if (Arg.hasName())
     Name = Arg.getName();
 
+  if (Name.starts_with("_hidden"))
+    return;
+
   StringRef TypeName;
   Node = Func->getMetadata("kernel_arg_type");
   if (Node && ArgNo < Node->getNumOperands())

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 178 additions & 2 deletions

@@ -13,6 +13,7 @@
 
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +32,88 @@ class PreloadKernelArgInfo {
   const GCNSubtarget &ST;
   unsigned NumFreeUserSGPRs;
 
-public:
-  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
+  enum HiddenArg : unsigned {
+    HIDDEN_BLOCK_COUNT_X,
+    HIDDEN_BLOCK_COUNT_Y,
+    HIDDEN_BLOCK_COUNT_Z,
+    HIDDEN_GROUP_SIZE_X,
+    HIDDEN_GROUP_SIZE_Y,
+    HIDDEN_GROUP_SIZE_Z,
+    HIDDEN_REMAINDER_X,
+    HIDDEN_REMAINDER_Y,
+    HIDDEN_REMAINDER_Z,
+    END_HIDDEN_ARGS
+  };
+
+  struct HiddenArgInfo {
+    unsigned Offset;
+    unsigned Size;
+    const char *Name;
+  };
+
+  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
+      {22, 2, "_hidden_remainder_z"}};
+
+  static HiddenArg getHiddenArgIndexFromOffset(unsigned Offset) {
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      if (HiddenArgs[I].Offset == Offset)
+        return static_cast<HiddenArg>(I);
+
+    llvm_unreachable("Unexpected hidden argument offset.");
+  }
+
+  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS)
+      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  static const char *getHiddenArgName(HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS) {
+      return HiddenArgs[HA].Name;
+    }
+    llvm_unreachable("Unexpected hidden argument.");
+  }
 
+  Function *cloneFunctionWithPreloadImplicitArgs() {
+    FunctionType *FT = F.getFunctionType();
+    std::vector<Type *> FTypes(FT->param_begin(), FT->param_end());
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      FTypes.push_back(getHiddenArgType(F.getContext(), HiddenArg(I)));
+
+    FunctionType *NFT =
+        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+    Function *NF =
+        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+
+    NF->copyAttributesFrom(&F);
+    NF->copyMetadata(&F, 0);
+    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
+    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+    NF->takeName(&F);
+    assert(F.use_empty());
+    NF->splice(NF->begin(), &F);
+
+    Function::arg_iterator NFArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(&*NFArg);
+      NFArg->takeName(&Arg);
+      ++NFArg;
+    }
+
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
+
+    return NF;
+  }
+
+public:
   PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
     setInitialFreeUserSGPRsCount();
   }
@@ -64,6 +144,94 @@ class PreloadKernelArgInfo {
     NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
     return true;
   }
+
+  // Try to allocate SGPRs to preload implicit kernel arguments.
+  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+                                       IRBuilder<> &Builder) {
+    StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
+    Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      Instruction *CI = dyn_cast<Instruction>(U);
+      if (!CI || CI->getParent()->getParent() != &F)
+        continue;
+
+      for (auto *U : CI->users()) {
+        int64_t Offset = 0;
+        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+        if (!Load) {
+          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+            continue;
+
+          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+        }
+
+        if (!Load || !Load->isSimple())
+          continue;
+
+        // FIXME: Expand to handle 64-bit implicit args and large merged loads.
+        unsigned LoadSize = Load->getType()->getScalarSizeInBits();
+        if (LoadSize != 32 && LoadSize != 16)
+          continue;
+
+        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+      }
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+              [](const std::pair<LoadInst *, unsigned> &A,
+                 const std::pair<LoadInst *, unsigned> &B) {
+                return A.second < B.second;
+              });
+
+    uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
+    bool AddedHiddenArgsToSignature = false;
+    Function *NF = nullptr;
+    unsigned LastPreloadIndex = 0;
+    for (const auto &Load : ImplicitArgLoads) {
+      LoadInst *LoadInst = Load.first;
+      Type *LoadType = LoadInst->getType();
+      auto LoadOffset = Load.second;
+      unsigned LoadSize = DL.getTypeStoreSize(LoadType);
+      // If we fail to preload any implicit argument we know we don't have SGPRs
+      // to preload any subsequent ones with larger offsets.
+      if (!tryAllocPreloadSGPRs(LoadSize, LoadOffset + ImplicitArgsBaseOffset,
+                                LastExplicitArgOffset))
+        break;
+
+      if (!AddedHiddenArgsToSignature) {
+        NF = cloneFunctionWithPreloadImplicitArgs();
+        AddedHiddenArgsToSignature = true;
+      }
+
+      LastExplicitArgOffset = LoadOffset + LoadSize;
+      unsigned HiddenArgIndex = getHiddenArgIndexFromOffset(LoadOffset);
+      assert(NF);
+      unsigned Index = NF->arg_size() - END_HIDDEN_ARGS + HiddenArgIndex;
+      Argument *Arg = NF->getArg(Index);
+      LoadInst->replaceAllUsesWith(Arg);
+      if (Index > HiddenArgIndex)
+        LastPreloadIndex = HiddenArgIndex;
+    }
+
+    // Ensure all hidden arguments up to the final preload are also
+    // preloaded, even if some are unused.
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+      NF->getArg(NF->arg_size() - END_HIDDEN_ARGS + I)
+          ->addAttr(Attribute::InReg);
+
+    F.removeFromParent();
+  }
 };
 
 class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +449,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   KernArgSegment->addRetAttr(
       Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
 
+  if (InPreloadSequence) {
+    uint64_t ImplicitArgsBaseOffset =
+        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+        BaseOffset;
+    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                Builder);
+  }
+
   return true;
 }
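
The loop in tryAllocImplicitArgPreloadSGPRs only clones the function once the first hidden argument is known to fit, stops at the first load that no longer fits, and then marks every hidden argument up to the last preloaded one as inreg so the preloaded SGPR block stays contiguous. The sketch below is a simplified, hypothetical model of that accounting, not the LLVM code: the SGPR budget, the dword rounding, and the ImplicitLoad type are all assumptions made for illustration.

#include <algorithm>
#include <cstdio>
#include <vector>

// One discovered load from the implicit argument pointer.
struct ImplicitLoad {
  unsigned Offset; // byte offset from the implicitarg base
  unsigned Size;   // bytes loaded
};

int main() {
  unsigned FreeUserSGPRs = 3; // assumed remaining user SGPRs (4 bytes each)
  std::vector<ImplicitLoad> Loads = {{8, 4}, {0, 4}, {14, 2}};

  // Consider loads in order of offset, as the pass does.
  std::sort(Loads.begin(), Loads.end(),
            [](const ImplicitLoad &A, const ImplicitLoad &B) {
              return A.Offset < B.Offset;
            });

  int LastPreloadedOffset = -1;
  unsigned UsedSGPRs = 0; // dwords of the implicitarg area already covered
  for (const ImplicitLoad &L : Loads) {
    unsigned End = L.Offset + L.Size;
    unsigned TotalSGPRs = (End + 3) / 4;      // dwords needed up to End
    unsigned Needed = TotalSGPRs - UsedSGPRs; // extra SGPRs, including padding
    if (Needed > FreeUserSGPRs)
      break; // loads at larger offsets cannot be preloaded either
    FreeUserSGPRs -= Needed;
    UsedSGPRs = TotalSGPRs;
    LastPreloadedOffset = static_cast<int>(L.Offset);
  }

  // Every hidden argument up to this offset would be marked inreg, even if
  // some of the intermediate ones are never loaded by the kernel.
  std::printf("last preloaded hidden-arg offset: %d\n", LastPreloadedOffset);
  return 0;
}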

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 2 additions & 0 deletions

@@ -601,6 +601,8 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
   MaxAlign = Align(1);
 
   for (const Argument &Arg : F.args()) {
+    if (Arg.getName().starts_with("_hidden"))
+      continue;
     const bool IsByRef = Arg.hasByRefAttr();
     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
     Align Alignment = DL.getValueOrABITypeAlignment(

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 14 additions & 0 deletions

@@ -2496,6 +2496,7 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
   bool InPreloadSequence = true;
   unsigned InIdx = 0;
+  bool AlignedForImplictArgs = false;
   for (auto &Arg : F.args()) {
     if (!InPreloadSequence || !Arg.hasInRegAttr())
       break;

@@ -2518,6 +2519,19 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
     unsigned NumAllocSGPRs =
         alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
 
+    if (!AlignedForImplictArgs && Arg.getName().starts_with("_hidden")) {
+      unsigned OffsetBefore = LastExplicitArgOffset;
+      LastExplicitArgOffset = alignTo(
+          LastExplicitArgOffset, Subtarget->getAlignmentForImplicitArgPtr());
+      if (OffsetBefore != LastExplicitArgOffset) {
+        unsigned PaddingSGPRs =
+            alignTo(LastExplicitArgOffset - OffsetBefore, 4) / 4;
+        Info.allocateUserSGPRs(PaddingSGPRs);
+        ArgOffset += PaddingSGPRs * 4;
+      }
+      AlignedForImplictArgs = true;
+    }
+
     // Arg is preloaded into the previous SGPR.
     if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
       Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
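
The new block above pads the user SGPR allocation when the explicit kernel arguments do not end on the implicitarg_ptr alignment boundary. Below is a small worked example of that arithmetic with hypothetical numbers (explicit arguments ending at byte 12, an 8 byte implicitarg alignment); alignToVal is an illustrative stand-in for llvm::alignTo.

#include <cstdio>

// Stand-in for llvm::alignTo on plain integers.
static unsigned alignToVal(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  unsigned LastExplicitArgOffset = 12; // assumed end of the explicit kernargs
  unsigned ImplicitArgAlign = 8;       // assumed implicitarg_ptr alignment

  unsigned AlignedOffset = alignToVal(LastExplicitArgOffset, ImplicitArgAlign);
  unsigned PaddingSGPRs =
      alignToVal(AlignedOffset - LastExplicitArgOffset, 4) / 4;

  // Here one padding user SGPR is reserved and the hidden arguments start at
  // byte 16, one dword past where the explicit arguments ended.
  std::printf("padding SGPRs: %u, hidden args begin at byte %u\n",
              PaddingSGPRs, AlignedOffset);
  return 0;
}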

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 9 additions & 0 deletions

@@ -278,6 +278,15 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
 }
 
+bool SIMachineFunctionInfo::allocateUserSGPRs(
+    unsigned Number) {
+  if (Number <= getNumUserSGPRs())
+    return false;
+
+  NumUserSGPRs = Number;
+  return true;
+}
+
 void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                              uint64_t Size, Align Alignment) {
   // Skip if it is an entry function or the register is already added.

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 3 additions & 0 deletions

@@ -760,6 +760,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
                               unsigned AllocSizeDWord, int KernArgIdx,
                               int PaddingSGPRs);
 
+  /// Reserve up to \p Number of user SGPRs.
+  bool allocateUserSGPRs(unsigned Number);
+
   /// Increment user SGPRs used for padding the argument list only.
   Register addReservedUserSGPR() {
     Register Next = getNextUserSGPR();
