Skip to content

[AMDGPU][SplitModule] Cleanup CallsExternal Handling #106528

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 63 additions & 22 deletions llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
Expand Down Expand Up @@ -103,6 +104,11 @@ static cl::opt<bool> NoExternalizeGlobals(
cl::desc("disables externalization of global variable with local linkage; "
"may cause globals to be duplicated which increases binary size"));

static cl::opt<bool> NoExternalizeOnAddrTaken(
"amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
cl::desc(
"disables externalization of functions whose addresses are taken"));

static cl::opt<std::string>
ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
cl::Hidden,
Expand Down Expand Up @@ -482,6 +488,9 @@ void SplitGraph::buildGraph(CallGraph &CG) {
dbgs()
<< "[build graph] constructing graph representation of the input\n");

// FIXME(?): Is the callgraph really worth using if we have to iterate the
// function again whenever it fails to give us enough information?

// We build the graph by just iterating all functions in the module and
// working on their direct callees. At the end, all nodes should be linked
// together as expected.
Expand All @@ -492,29 +501,52 @@ void SplitGraph::buildGraph(CallGraph &CG) {
continue;

// Look at direct callees and create the necessary edges in the graph.
bool HasIndirectCall = false;
Node &N = getNode(Cache, Fn);
SetVector<const Function *> DirectCallees;
bool CallsExternal = false;
for (auto &CGEntry : *CG[&Fn]) {
auto *CGNode = CGEntry.second;
auto *Callee = CGNode->getFunction();
if (!Callee) {
// TODO: Don't consider inline assembly as indirect calls.
if (CGNode == CG.getCallsExternalNode())
HasIndirectCall = true;
continue;
}

if (!Callee->isDeclaration())
createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);
if (auto *Callee = CGNode->getFunction()) {
if (!Callee->isDeclaration())
DirectCallees.insert(Callee);
} else if (CGNode == CG.getCallsExternalNode())
CallsExternal = true;
}

// Keep track of this function if it contains an indirect call and/or if it
// can be indirectly called.
if (HasIndirectCall) {
LLVM_DEBUG(dbgs() << "indirect call found in " << Fn.getName() << "\n");
FnsWithIndirectCalls.push_back(&Fn);
if (CallsExternal) {
LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for ";
Fn.printAsOperand(dbgs());
dbgs() << " - analyzing function\n");

bool HasIndirectCall = false;
for (const auto &Inst : instructions(Fn)) {
// look at all calls without a direct callee.
if (const auto *CB = dyn_cast<CallBase>(&Inst);
CB && !CB->getCalledFunction()) {
// inline assembly can be ignored, unless InlineAsmIsIndirectCall is
// true.
if (CB->isInlineAsm()) {
LLVM_DEBUG(dbgs() << " found inline assembly\n");
continue;
}

// everything else is handled conservatively.
HasIndirectCall = true;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this loop break at this point? Alternatively, this could be turned into a helper function that scans for indirect calls and returns early.

break;
}
}

if (HasIndirectCall) {
LLVM_DEBUG(dbgs() << " indirect call found\n");
FnsWithIndirectCalls.push_back(&Fn);
}
}

Node &N = getNode(Cache, Fn);
for (const auto *Callee : DirectCallees)
createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);

if (canBeIndirectlyCalled(Fn))
IndirectlyCallableFns.push_back(&Fn);
}
Expand Down Expand Up @@ -1326,13 +1358,21 @@ static void splitAMDGPUModule(
//
// Additionally, it guides partitioning to not duplicate this function if it's
// called directly at some point.
for (auto &Fn : M) {
if (Fn.hasAddressTaken()) {
if (Fn.hasLocalLinkage()) {
LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName()
<< " because its address is taken\n");
//
// TODO: Could we be smarter about this? This makes all functions whose
// addresses are taken non-copyable. We should probably model this type of
// constraint in the graph and use it to guide splitting, instead of
// externalizing like this. Maybe non-copyable should really mean "keep one
// visible copy, then internalize all other copies" for some functions?
if (!NoExternalizeOnAddrTaken) {
for (auto &Fn : M) {
// TODO: Should aliases count? Probably not but they're so rare I'm not
// sure it's worth fixing.
if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs());
dbgs() << " because its address is taken\n");
externalize(Fn);
}
externalize(Fn);
}
}

Expand Down Expand Up @@ -1368,7 +1408,8 @@ static void splitAMDGPUModule(
dbgs() << "[graph] nodes:\n";
for (const SplitGraph::Node *N : SG.nodes()) {
dbgs() << " - [" << N->getID() << "]: " << N->getName() << " "
<< (N->isGraphEntryPoint() ? "(entry)" : "") << "\n";
<< (N->isGraphEntryPoint() ? "(entry)" : "") << " "
<< (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
}
});

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
; REQUIRES: asserts

; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-address-taken -debug-only=amdgpu-split-module 2>&1 | FileCheck %s

; CHECK: [!] callgraph is incomplete for ptr @A - analyzing function
; CHECK-NEXT: found inline assembly
; CHECK-NOT: indirect call found

; Global array holding the addresses of both helpers, so @HelperA and @HelperB
; each have their address taken.
@addrthief = global [2 x ptr] [ptr @HelperA, ptr @HelperB]

; Internal helper with an empty body; called directly by kernel @A.
define internal void @HelperA() {
ret void
}

; Internal helper with an empty body; called directly by kernel @B.
define internal void @HelperB() {
ret void
}

; Kernel containing an inline assembly call followed by a direct call to
; @HelperA. The inline asm is the only call in this function that has no
; direct callee.
define amdgpu_kernel void @A() {
call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"()
call void @HelperA()
ret void
}

; Kernel making a single direct call to @HelperB; its %out argument is unused.
define amdgpu_kernel void @B(ptr %out) {
call void @HelperB()
ret void
}
30 changes: 30 additions & 0 deletions llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-address-taken
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s

; CHECK0: define internal void @HelperB
; CHECK0: define amdgpu_kernel void @B

; CHECK1: define internal void @HelperA()
; CHECK1: define amdgpu_kernel void @A()

; Global array holding the addresses of both helpers, so @HelperA and @HelperB
; each have their address taken.
@addrthief = global [2 x ptr] [ptr @HelperA, ptr @HelperB]

; Internal helper with an empty body; called directly by kernel @A.
define internal void @HelperA() {
ret void
}

; Internal helper with an empty body; called directly by kernel @B.
define internal void @HelperB() {
ret void
}

; Kernel whose only calls are an inline assembly statement and a direct call
; to @HelperA; it contains no call through a function pointer.
define amdgpu_kernel void @A() {
call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"()
call void @HelperA()
ret void
}

; Kernel making a single direct call to @HelperB; its %out argument is unused.
define amdgpu_kernel void @B(ptr %out) {
call void @HelperB()
ret void
}
41 changes: 0 additions & 41 deletions llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll

This file was deleted.

12 changes: 0 additions & 12 deletions llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,6 @@
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s

; We have 4 kernels:
; - Each kernel has an internal helper
; - The helpers of @A and @B each make an indirect call.
;
; We default to putting A/B in P0, alongside a copy
; of all helpers that have their address taken.
; The other kernels can still go into separate partitions.
;
; Note that dependency discovery shouldn't stop upon finding an
; indirect call. HelperC/D should also end up in P0 as they
; are dependencies of HelperB.

; CHECK0: define internal void @HelperD
; CHECK0: define amdgpu_kernel void @D

Expand Down
Loading