Skip to content

Commit 9d9e023

Browse files
committed
[AMDGPU] Enable amdgpu-sw-lower-lds pass to lower LDS accesses to use device global memory. (#87265)
1 parent 3a3aeb8 commit 9d9e023

18 files changed

+2184
-228
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
263263
bool GlobalOpt;
264264
};
265265

266+
void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
267+
extern char &AMDGPUSwLowerLDSLegacyPassID;
268+
ModulePass *createAMDGPUSwLowerLDSLegacyPass();
269+
270+
/// Module pass exposed through PassInfoMixin; the pass registry in this same
/// commit names it "amdgpu-sw-lower-lds". The commit title describes it as
/// lowering LDS accesses to use device global memory.
struct AMDGPUSwLowerLDSPass : PassInfoMixin<AMDGPUSwLowerLDSPass> {
271+
AMDGPUSwLowerLDSPass() {}
272+
/// Runs the pass over \p M and returns the set of preserved analyses.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
273+
};
274+
266275
class AMDGPUCodeGenPreparePass
267276
: public PassInfoMixin<AMDGPUCodeGenPreparePass> {
268277
private:

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 2 additions & 228 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@
212212
#define DEBUG_TYPE "amdgpu-lower-module-lds"
213213

214214
using namespace llvm;
215+
using namespace AMDGPU;
215216

216217
namespace {
217218

@@ -234,17 +235,6 @@ cl::opt<LoweringKind> LoweringKindLoc(
234235
clEnumValN(LoweringKind::hybrid, "hybrid",
235236
"Lower via mixture of above strategies")));
236237

237-
/// Returns true when AMDGPU::isKernel accepts the calling convention of \p F.
/// \p F is dereferenced unconditionally, so it must not be null.
bool isKernelLDS(const Function *F) {
238-
// Some weirdness here. AMDGPU::isKernelCC does not call into
239-
// AMDGPU::isKernel with the calling conv, it instead calls into
240-
// isModuleEntryFunction which returns true for more calling conventions
241-
// than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
242-
// There's also a test that checks that the LDS lowering does not hit on
243-
// a graphics shader, denoted amdgpu_ps, so stay with the limited case.
244-
// Putting LDS in the name of the function to draw attention to this.
245-
return AMDGPU::isKernel(F->getCallingConv());
246-
}
247-
248238
template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
249239
llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
250240
return L->getName() < R->getName();
@@ -305,183 +295,9 @@ class AMDGPULowerModuleLDS {
305295
Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
306296
}
307297

308-
/// Gathers every global of \p M accepted by AMDGPU::isLDSVariableToLower and
/// hands the list to convertUsersOfConstantsToInstructions.
///
/// \returns whatever convertUsersOfConstantsToInstructions returns for that
/// list.
static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
309-
// Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
310-
// global may have uses from multiple different functions as a result.
311-
// This pass specialises LDS variables with respect to the kernel that
312-
// allocates them.
313-
314-
// This is semantically equivalent to the following loop nest, which is
// left unimplemented because it would be slow:
315-
// for (auto &F : M.functions())
316-
// for (auto &BB : F)
317-
// for (auto &I : BB)
318-
// for (Use &Op : I.operands())
319-
// if (constantExprUsesLDS(Op))
320-
// replaceConstantExprInFunction(I, Op);
321-
322-
SmallVector<Constant *> LDSGlobals;
323-
for (auto &GV : M.globals())
324-
if (AMDGPU::isLDSVariableToLower(GV))
325-
LDSGlobals.push_back(&GV);
326-
327-
return convertUsersOfConstantsToInstructions(LDSGlobals);
328-
}
329-
330298
public:
331299
AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
332300

333-
// Maps a function to a set of global variables associated with it.
using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
334-
335-
// Maps a global variable to a set of functions associated with it.
using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
336-
337-
/// Records, for each global of \p M accepted by AMDGPU::isLDSVariableToLower,
/// the functions that contain an instruction using it directly.
///
/// \param CG        Not read by this function.
/// \param M         Module whose globals are scanned.
/// \param kernels   Output: entries for functions where isKernelLDS is true.
/// \param functions Output: entries for all other functions.
///
/// Users that are not Instructions are skipped.
static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
338-
FunctionVariableMap &kernels,
339-
FunctionVariableMap &functions) {
340-
341-
// Get uses from the current function, excluding uses by called functions
342-
// Two output variables to avoid walking the globals list twice
343-
for (auto &GV : M.globals()) {
344-
if (!AMDGPU::isLDSVariableToLower(GV)) {
345-
continue;
346-
}
347-
348-
for (User *V : GV.users()) {
349-
if (auto *I = dyn_cast<Instruction>(V)) {
350-
Function *F = I->getFunction();
351-
if (isKernelLDS(F)) {
352-
kernels[F].insert(&GV);
353-
} else {
354-
functions[F].insert(&GV);
355-
}
356-
}
357-
}
358-
}
359-
}
360-
361-
// Result type of getTransitiveUsesOfLDS, which brace-initialises it in member
// order: first the kernel direct-use map, then the kernel indirect-use map.
struct LDSUsesInfoTy {
362-
// Per kernel: variables used by instructions inside the kernel itself.
FunctionVariableMap direct_access;
363-
// Per kernel: variables reached through the kernel's call records.
FunctionVariableMap indirect_access;
364-
};
365-
366-
/// Computes, per kernel, the LDS variables it uses directly and the ones it
/// can reach through calls recorded in \p CG.
///
/// Steps, in order:
///  1. getUsesOfLDSByFunction fills the direct-use maps for kernels and for
///     non-kernel functions.
///  2. The direct uses of every address-taken non-kernel function are pooled
///     into VariablesReachableThroughFunctionPointer.
///  3. Every defined non-kernel function that has a call record without a
///     known callee Function is given the whole pool.
///  4. Every defined non-kernel function is given the union of the direct uses
///     of all functions reachable from it in \p CG.
///  5. Every defined kernel gets, per call record, either the callee's
///     transitive set or (when the callee Function is unknown) the pool.
///
/// Calls report_fatal_error when the collected kernel variables mix absolute
/// and non-absolute symbols.
///
/// \returns two empty maps when every collected variable is absolute;
/// otherwise {direct kernel uses, indirect kernel uses}.
static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
367-
368-
FunctionVariableMap direct_map_kernel;
369-
FunctionVariableMap direct_map_function;
370-
getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);
371-
372-
// Collect variables that are used by functions whose address has escaped
373-
DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
374-
for (Function &F : M.functions()) {
375-
if (!isKernelLDS(&F))
376-
if (F.hasAddressTaken(nullptr,
377-
/* IgnoreCallbackUses */ false,
378-
/* IgnoreAssumeLikeCalls */ false,
379-
/* IgnoreLLVMUsed */ true,
380-
/* IgnoreArcAttachedCall */ false)) {
381-
set_union(VariablesReachableThroughFunctionPointer,
382-
direct_map_function[&F]);
383-
}
384-
}
385-
386-
// True when any call record of F has a callee node without a Function.
auto functionMakesUnknownCall = [&](const Function *F) -> bool {
387-
assert(!F->isDeclaration());
388-
for (const CallGraphNode::CallRecord &R : *CG[F]) {
389-
if (!R.second->getFunction()) {
390-
return true;
391-
}
392-
}
393-
return false;
394-
};
395-
396-
// Work out which variables are reachable through function calls
397-
FunctionVariableMap transitive_map_function = direct_map_function;
398-
399-
// If the function makes any unknown call, assume the worst case that it can
400-
// access all variables accessed by functions whose address escaped
401-
for (Function &F : M.functions()) {
402-
if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
403-
if (!isKernelLDS(&F)) {
404-
set_union(transitive_map_function[&F],
405-
VariablesReachableThroughFunctionPointer);
406-
}
407-
}
408-
}
409-
410-
// Direct implementation of collecting all variables reachable from each
411-
// function
412-
for (Function &Func : M.functions()) {
413-
if (Func.isDeclaration() || isKernelLDS(&Func))
414-
continue;
415-
416-
DenseSet<Function *> seen; // catches cycles
417-
SmallVector<Function *, 4> wip{&Func};
418-
419-
while (!wip.empty()) {
420-
Function *F = wip.pop_back_val();
421-
422-
// Can accelerate this by referring to transitive map for functions that
423-
// have already been computed, with more care than this
424-
set_union(transitive_map_function[&Func], direct_map_function[F]);
425-
426-
for (const CallGraphNode::CallRecord &R : *CG[F]) {
427-
Function *ith = R.second->getFunction();
428-
if (ith) {
429-
if (!seen.contains(ith)) {
430-
seen.insert(ith);
431-
wip.push_back(ith);
432-
}
433-
}
434-
}
435-
}
436-
}
437-
438-
// direct_map_kernel lists which variables are used by the kernel
439-
// find the variables which are used through a function call
440-
FunctionVariableMap indirect_map_kernel;
441-
442-
for (Function &Func : M.functions()) {
443-
if (Func.isDeclaration() || !isKernelLDS(&Func))
444-
continue;
445-
446-
for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
447-
Function *ith = R.second->getFunction();
448-
if (ith) {
449-
set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
450-
} else {
451-
set_union(indirect_map_kernel[&Func],
452-
VariablesReachableThroughFunctionPointer);
453-
}
454-
}
455-
}
456-
457-
// Verify that we fall into one of 2 cases:
458-
// - All variables are absolute: this is a re-run of the pass
459-
// so we don't have anything to do.
460-
// - No variables are absolute.
461-
std::optional<bool> HasAbsoluteGVs;
462-
// NOTE(review): the braced list is a std::initializer_list, so both maps are
// copied for the duration of this check.
for (auto &Map : {direct_map_kernel, indirect_map_kernel}) {
463-
for (auto &[Fn, GVs] : Map) {
464-
for (auto *GV : GVs) {
465-
bool IsAbsolute = GV->isAbsoluteSymbolRef();
466-
if (HasAbsoluteGVs.has_value()) {
467-
if (*HasAbsoluteGVs != IsAbsolute) {
468-
report_fatal_error(
469-
"Module cannot mix absolute and non-absolute LDS GVs");
470-
}
471-
} else
472-
HasAbsoluteGVs = IsAbsolute;
473-
}
474-
}
475-
}
476-
477-
// If we only had absolute GVs, we have nothing to do, return an empty
478-
// result.
479-
if (HasAbsoluteGVs && *HasAbsoluteGVs)
480-
return {FunctionVariableMap(), FunctionVariableMap()};
481-
482-
return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
483-
}
484-
485301
struct LDSVariableReplacement {
486302
GlobalVariable *SGV = nullptr;
487303
DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
@@ -1046,48 +862,6 @@ class AMDGPULowerModuleLDS {
1046862
return N;
1047863
}
1048864

1049-
/// Strip "amdgpu-no-lds-kernel-id" from any functions where we may have
1050-
/// introduced its use. If AMDGPUAttributor ran prior to the pass, we inferred
1051-
/// the lack of llvm.amdgcn.lds.kernel.id calls.
///
/// The attribute is removed from \p KernelRoot and from every function reached
/// by following call records in \p CG starting at \p KernelRoot. The first
/// time a call record without a known callee Function is met, the attribute is
/// also removed from every non-kernel function listed under
/// CG.getExternalCallingNode().
///
/// \param CG         Call graph that is walked.
/// \param KernelRoot Function at which the walk starts.
1052-
void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
1053-
KernelRoot->removeFnAttr("amdgpu-no-lds-kernel-id");
1054-
1055-
SmallVector<Function *> WorkList({CG[KernelRoot]->getFunction()});
1056-
// Callees already queued once, so each is expanded a single time.
SmallPtrSet<Function *, 8> Visited;
1057-
// Ensures the external-calling-node sweep below runs at most once.
bool SeenUnknownCall = false;
1058-
1059-
while (!WorkList.empty()) {
1060-
Function *F = WorkList.pop_back_val();
1061-
1062-
for (auto &CallRecord : *CG[F]) {
1063-
if (!CallRecord.second)
1064-
continue;
1065-
1066-
Function *Callee = CallRecord.second->getFunction();
1067-
if (!Callee) {
1068-
if (!SeenUnknownCall) {
1069-
SeenUnknownCall = true;
1070-
1071-
// If we see any indirect calls, assume nothing about potential
1072-
// targets.
1073-
// TODO: This could be refined to possible LDS global users.
1074-
for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
1075-
Function *PotentialCallee =
1076-
ExternalCallRecord.second->getFunction();
1077-
assert(PotentialCallee);
1078-
if (!isKernelLDS(PotentialCallee))
1079-
PotentialCallee->removeFnAttr("amdgpu-no-lds-kernel-id");
1080-
}
1081-
}
1082-
} else {
1083-
Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
1084-
if (Visited.insert(Callee).second)
1085-
WorkList.push_back(Callee);
1086-
}
1087-
}
1088-
}
1089-
}
1090-
1091865
DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
1092866
Module &M, LDSUsesInfoTy &LDSUsesInfo,
1093867
DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
@@ -1243,7 +1017,7 @@ class AMDGPULowerModuleLDS {
12431017
//
12441018
// TODO: We could filter out subgraphs that do not access LDS globals.
12451019
for (Function *F : KernelsThatAllocateTableLDS)
1246-
removeNoLdsKernelIdFromReachable(CG, F);
1020+
removeFnAttrFromReachable(CG, F, "amdgpu-no-lds-kernel-id");
12471021
}
12481022

12491023
DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
2222
AMDGPULowerBufferFatPointersPass(*this))
2323
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
2424
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
25+
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass())
2526
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
2627
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
2728
#undef MODULE_PASS

0 commit comments

Comments
 (0)