13
13
14
14
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +32,88 @@ class PreloadKernelArgInfo {
31
32
const GCNSubtarget &ST;
32
33
unsigned NumFreeUserSGPRs;
33
34
34
- public:
35
- SmallVector<llvm::Metadata *, 8 > KernelArgMetadata;
35
+ enum HiddenArg : unsigned {
36
+ HIDDEN_BLOCK_COUNT_X,
37
+ HIDDEN_BLOCK_COUNT_Y,
38
+ HIDDEN_BLOCK_COUNT_Z,
39
+ HIDDEN_GROUP_SIZE_X,
40
+ HIDDEN_GROUP_SIZE_Y,
41
+ HIDDEN_GROUP_SIZE_Z,
42
+ HIDDEN_REMAINDER_X,
43
+ HIDDEN_REMAINDER_Y,
44
+ HIDDEN_REMAINDER_Z,
45
+ END_HIDDEN_ARGS
46
+ };
47
+
48
+ struct HiddenArgInfo {
49
+ unsigned Offset;
50
+ unsigned Size ;
51
+ const char *Name;
52
+ };
53
+
54
+ static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
55
+ {0 , 4 , " _hidden_block_count_x" }, {4 , 4 , " _hidden_block_count_y" },
56
+ {8 , 4 , " _hidden_block_count_z" }, {12 , 2 , " _hidden_group_size_x" },
57
+ {14 , 2 , " _hidden_group_size_y" }, {16 , 2 , " _hidden_group_size_z" },
58
+ {18 , 2 , " _hidden_remainder_x" }, {20 , 2 , " _hidden_remainder_y" },
59
+ {22 , 2 , " _hidden_remainder_z" }};
60
+
61
+ static HiddenArg getHiddenArgIndexFromOffset (unsigned Offset) {
62
+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
63
+ if (HiddenArgs[I].Offset == Offset)
64
+ return static_cast <HiddenArg>(I);
65
+
66
+ llvm_unreachable (" Unexpected hidden argument offset." );
67
+ }
68
+
69
+ static Type *getHiddenArgType (LLVMContext &Ctx, HiddenArg HA) {
70
+ if (HA < END_HIDDEN_ARGS)
71
+ return Type::getIntNTy (Ctx, HiddenArgs[HA].Size * 8 );
72
+
73
+ llvm_unreachable (" Unexpected hidden argument." );
74
+ }
75
+
76
+ static const char *getHiddenArgName (HiddenArg HA) {
77
+ if (HA < END_HIDDEN_ARGS) {
78
+ return HiddenArgs[HA].Name ;
79
+ }
80
+ llvm_unreachable (" Unexpected hidden argument." );
81
+ }
36
82
83
+ Function *cloneFunctionWithPreloadImplicitArgs () {
84
+ FunctionType *FT = F.getFunctionType ();
85
+ std::vector<Type *> FTypes (FT->param_begin (), FT->param_end ());
86
+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
87
+ FTypes.push_back (getHiddenArgType (F.getContext (), HiddenArg (I)));
88
+
89
+ FunctionType *NFT =
90
+ FunctionType::get (FT->getReturnType (), FTypes, FT->isVarArg ());
91
+ Function *NF =
92
+ Function::Create (NFT, F.getLinkage (), F.getAddressSpace (), F.getName ());
93
+
94
+ NF->copyAttributesFrom (&F);
95
+ NF->copyMetadata (&F, 0 );
96
+ NF->setIsNewDbgInfoFormat (F.IsNewDbgInfoFormat );
97
+
98
+ F.getParent ()->getFunctionList ().insert (F.getIterator (), NF);
99
+ NF->takeName (&F);
100
+ assert (F.use_empty ());
101
+ NF->splice (NF->begin (), &F);
102
+
103
+ Function::arg_iterator NFArg = NF->arg_begin ();
104
+ for (Argument &Arg : F.args ()) {
105
+ Arg.replaceAllUsesWith (&*NFArg);
106
+ NFArg->takeName (&Arg);
107
+ ++NFArg;
108
+ }
109
+
110
+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
111
+ NFArg++->setName (getHiddenArgName (HiddenArg (I)));
112
+
113
+ return NF;
114
+ }
115
+
116
+ public:
37
117
PreloadKernelArgInfo (Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38
118
setInitialFreeUserSGPRsCount ();
39
119
}
@@ -64,6 +144,94 @@ class PreloadKernelArgInfo {
64
144
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65
145
return true ;
66
146
}
147
+
148
+ // Try to allocate SGPRs to preload implicit kernel arguments.
149
+ void tryAllocImplicitArgPreloadSGPRs (uint64_t ImplicitArgsBaseOffset,
150
+ IRBuilder<> &Builder) {
151
+ StringRef Name = Intrinsic::getName (Intrinsic::amdgcn_implicitarg_ptr);
152
+ Function *ImplicitArgPtr = F.getParent ()->getFunction (Name);
153
+ if (!ImplicitArgPtr)
154
+ return ;
155
+
156
+ const DataLayout &DL = F.getParent ()->getDataLayout ();
157
+ // Pair is the load and the load offset.
158
+ SmallVector<std::pair<LoadInst *, unsigned >, 4 > ImplicitArgLoads;
159
+ for (auto *U : ImplicitArgPtr->users ()) {
160
+ Instruction *CI = dyn_cast<Instruction>(U);
161
+ if (!CI || CI->getParent ()->getParent () != &F)
162
+ continue ;
163
+
164
+ for (auto *U : CI->users ()) {
165
+ int64_t Offset = 0 ;
166
+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
167
+ if (!Load) {
168
+ if (GetPointerBaseWithConstantOffset (U, Offset, DL) != CI)
169
+ continue ;
170
+
171
+ Load = dyn_cast<LoadInst>(*U->user_begin ()); // Load from GEP?
172
+ }
173
+
174
+ if (!Load || !Load->isSimple ())
175
+ continue ;
176
+
177
+ // FIXME: Expand to handle 64-bit implicit args and large merged loads.
178
+ unsigned LoadSize = Load->getType ()->getScalarSizeInBits ();
179
+ if (LoadSize != 32 && LoadSize != 16 )
180
+ continue ;
181
+
182
+ ImplicitArgLoads.push_back (std::make_pair (Load, Offset));
183
+ }
184
+ }
185
+
186
+ if (ImplicitArgLoads.empty ())
187
+ return ;
188
+
189
+ // Allocate loads in order of offset. We need to be sure that the implicit
190
+ // argument can actually be preloaded.
191
+ std::sort (ImplicitArgLoads.begin (), ImplicitArgLoads.end (),
192
+ [](const std::pair<LoadInst *, unsigned > &A,
193
+ const std::pair<LoadInst *, unsigned > &B) {
194
+ return A.second < B.second ;
195
+ });
196
+
197
+ uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
198
+ bool AddedHiddenArgsToSignature = false ;
199
+ Function *NF = nullptr ;
200
+ unsigned LastPreloadIndex = 0 ;
201
+ for (const auto &Load : ImplicitArgLoads) {
202
+ LoadInst *LoadInst = Load.first ;
203
+ Type *LoadType = LoadInst->getType ();
204
+ auto LoadOffset = Load.second ;
205
+ unsigned LoadSize = DL.getTypeStoreSize (LoadType);
206
+ // If we fail to preload any implicit argument we know we don't have SGPRs
207
+ // to preload any subsequent ones with larger offsets.
208
+ if (!tryAllocPreloadSGPRs (LoadSize, LoadOffset + ImplicitArgsBaseOffset,
209
+ LastExplicitArgOffset))
210
+ break ;
211
+
212
+ if (!AddedHiddenArgsToSignature) {
213
+ NF = cloneFunctionWithPreloadImplicitArgs ();
214
+ AddedHiddenArgsToSignature = true ;
215
+ }
216
+
217
+ LastExplicitArgOffset = LoadOffset + LoadSize;
218
+ unsigned HiddenArgIndex = getHiddenArgIndexFromOffset (LoadOffset);
219
+ assert (NF);
220
+ unsigned Index = NF->arg_size () - END_HIDDEN_ARGS + HiddenArgIndex;
221
+ Argument *Arg = NF->getArg (Index);
222
+ LoadInst->replaceAllUsesWith (Arg);
223
+ if (Index > HiddenArgIndex)
224
+ LastPreloadIndex = HiddenArgIndex;
225
+ }
226
+
227
+ // Ensure all hidden arguments up to the final preload are also
228
+ // preloaded, even if some are unused.
229
+ for (unsigned I = 0 ; I <= LastPreloadIndex; ++I)
230
+ NF->getArg (NF->arg_size () - END_HIDDEN_ARGS + I)
231
+ ->addAttr (Attribute::InReg);
232
+
233
+ F.removeFromParent ();
234
+ }
67
235
};
68
236
69
237
class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +449,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
281
449
KernArgSegment->addRetAttr (
282
450
Attribute::getWithAlignment (Ctx, std::max (KernArgBaseAlign, MaxAlign)));
283
451
452
+ if (InPreloadSequence) {
453
+ uint64_t ImplicitArgsBaseOffset =
454
+ alignTo (ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr ()) +
455
+ BaseOffset;
456
+ PreloadInfo.tryAllocImplicitArgPreloadSGPRs (ImplicitArgsBaseOffset,
457
+ Builder);
458
+ }
459
+
284
460
return true ;
285
461
}
286
462
0 commit comments