Skip to content

Commit 055e431

Browse files
[MemProf] Reduce cloning overhead by sharing nodes when possible (#99832)
When assigning calls to nodes while building the graph, we can share nodes between multiple calls in some cases. Specifically, when we process the list of calls that had the same stack ids (possibly pruned, because we are looking at the stack ids that actually had nodes in the graph due to stack ids in the pruned allocation MIBs), for calls that are located in the same function, we know that they will behave exactly the same through cloning and function assignment. Therefore, instead of creating nodes for all of them (requiring context id duplication), keep a list of additional "matching calls" on the nodes. During function assignment we simply update all the matching calls the same way as the primary call. This change not only reduces the number of nodes (both original and cloned), but also greatly reduces the number of duplicated context ids and the time to propagate them. For a large target, I measured a 25% peak memory reduction and 42% time reduction.
1 parent 7e1fcf5 commit 055e431

File tree

1 file changed

+109
-15
lines changed

1 file changed

+109
-15
lines changed

llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Lines changed: 109 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -242,9 +242,16 @@ class CallsiteContextGraph {
242242
// recursion.
243243
bool Recursive = false;
244244

245-
// The corresponding allocation or interior call.
245+
// The corresponding allocation or interior call. This is the primary call
246+
// for which we have created this node.
246247
CallInfo Call;
247248

249+
// List of other calls that can be treated the same as the primary call
250+
// through cloning. I.e. located in the same function and have the same
251+
// (possibly pruned) stack ids. They will be updated the same way as the
252+
// primary call when assigning to function clones.
253+
std::vector<CallInfo> MatchingCalls;
254+
248255
// For alloc nodes this is a unique id assigned when constructed, and for
249256
// callsite stack nodes it is the original stack id when the node is
250257
// constructed from the memprof MIB metadata on the alloc nodes. Note that
@@ -457,6 +464,9 @@ class CallsiteContextGraph {
457464
/// iteration.
458465
MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
459466

467+
/// Records the function each call is located in.
468+
DenseMap<CallInfo, const FuncTy *> CallToFunc;
469+
460470
/// Map from callsite node to the enclosing caller function.
461471
std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
462472

@@ -474,7 +484,8 @@ class CallsiteContextGraph {
474484
/// StackIdToMatchingCalls map.
475485
void assignStackNodesPostOrder(
476486
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
477-
DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
487+
DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
488+
DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
478489

479490
/// Duplicates the given set of context ids, updating the provided
480491
/// map from each original id with the newly generated context ids,
@@ -1230,10 +1241,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
12301241

12311242
template <typename DerivedCCG, typename FuncTy, typename CallTy>
12321243
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1233-
assignStackNodesPostOrder(ContextNode *Node,
1234-
DenseSet<const ContextNode *> &Visited,
1235-
DenseMap<uint64_t, std::vector<CallContextInfo>>
1236-
&StackIdToMatchingCalls) {
1244+
assignStackNodesPostOrder(
1245+
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1246+
DenseMap<uint64_t, std::vector<CallContextInfo>>
1247+
&StackIdToMatchingCalls,
1248+
DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
12371249
auto Inserted = Visited.insert(Node);
12381250
if (!Inserted.second)
12391251
return;
@@ -1246,7 +1258,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
12461258
// Skip any that have been removed during the recursion.
12471259
if (!Edge)
12481260
continue;
1249-
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1261+
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1262+
CallToMatchingCall);
12501263
}
12511264

12521265
// If this node's stack id is in the map, update the graph to contain new
@@ -1289,8 +1302,19 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
12891302
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
12901303
// Skip any for which we didn't assign any ids, these don't get a node in
12911304
// the graph.
1292-
if (SavedContextIds.empty())
1305+
if (SavedContextIds.empty()) {
1306+
// If this call has a matching call (located in the same function and
1307+
// having the same stack ids), simply add it to the context node created
1308+
// for its matching call earlier. These can be treated the same through
1309+
// cloning and get updated at the same time.
1310+
if (!CallToMatchingCall.contains(Call))
1311+
continue;
1312+
auto MatchingCall = CallToMatchingCall[Call];
1313+
assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
1314+
NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1315+
Call);
12931316
continue;
1317+
}
12941318

12951319
assert(LastId == Ids.back());
12961320

@@ -1422,6 +1446,10 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
14221446
// there is more than one call with the same stack ids. Their (possibly newly
14231447
// duplicated) context ids are saved in the StackIdToMatchingCalls map.
14241448
DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1449+
// Save a map from each call to any that are found to match it. I.e. located
1450+
// in the same function and have the same (possibly pruned) stack ids. We use
1451+
// this to avoid creating extra graph nodes as they can be treated the same.
1452+
DenseMap<CallInfo, CallInfo> CallToMatchingCall;
14251453
for (auto &It : StackIdToMatchingCalls) {
14261454
auto &Calls = It.getSecond();
14271455
// Skip single calls with a single stack id. These don't need a new node.
@@ -1460,6 +1488,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
14601488
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
14611489
assert(!LastNodeContextIds.empty());
14621490

1491+
// Map from function to the first call from the below list (with matching
1492+
// stack ids) found in that function. Note that calls from different
1493+
// functions can have the same stack ids because this is the list of stack
1494+
// ids that had (possibly pruned) nodes after building the graph from the
1495+
// allocation MIBs.
1496+
DenseMap<const FuncTy *, CallInfo> FuncToCallMap;
1497+
14631498
for (unsigned I = 0; I < Calls.size(); I++) {
14641499
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
14651500
assert(SavedContextIds.empty());
@@ -1533,6 +1568,18 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
15331568
continue;
15341569
}
15351570

1571+
const FuncTy *CallFunc = CallToFunc[Call];
1572+
1573+
// If the prior call had the same stack ids this map would not be empty.
1574+
// Check if we already have a call that "matches" because it is located
1575+
// in the same function.
1576+
if (FuncToCallMap.contains(CallFunc)) {
1577+
// Record the matching call found for this call, and skip it. We
1578+
// will subsequently combine it into the same node.
1579+
CallToMatchingCall[Call] = FuncToCallMap[CallFunc];
1580+
continue;
1581+
}
1582+
15361583
// Check if the next set of stack ids is the same (since the Calls vector
15371584
// of tuples is sorted by the stack ids we can just look at the next one).
15381585
bool DuplicateContextIds = false;
@@ -1562,7 +1609,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
15621609
set_subtract(LastNodeContextIds, StackSequenceContextIds);
15631610
if (LastNodeContextIds.empty())
15641611
break;
1565-
}
1612+
// No longer possibly in a sequence of calls with duplicate stack ids,
1613+
// clear the map.
1614+
FuncToCallMap.clear();
1615+
} else
1616+
// Record the call with its function, so we can locate it the next time
1617+
// we find a call from this function when processing the calls with the
1618+
// same stack ids.
1619+
FuncToCallMap[CallFunc] = Call;
15661620
}
15671621
}
15681622

@@ -1579,7 +1633,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
15791633
// associated context ids over to the new nodes.
15801634
DenseSet<const ContextNode *> Visited;
15811635
for (auto &Entry : AllocationCallToContextNodeMap)
1582-
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1636+
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
1637+
CallToMatchingCall);
15831638
if (VerifyCCG)
15841639
check();
15851640
}
@@ -1679,6 +1734,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
16791734
continue;
16801735
if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
16811736
CallsWithMetadata.push_back(&I);
1737+
CallToFunc[&I] = &F;
16821738
auto *AllocNode = addAllocNode(&I, &F);
16831739
auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
16841740
assert(CallsiteMD);
@@ -1700,8 +1756,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
17001756
I.setMetadata(LLVMContext::MD_callsite, nullptr);
17011757
}
17021758
// For callsite metadata, add to list for this function for later use.
1703-
else if (I.getMetadata(LLVMContext::MD_callsite))
1759+
else if (I.getMetadata(LLVMContext::MD_callsite)) {
17041760
CallsWithMetadata.push_back(&I);
1761+
CallToFunc[&I] = &F;
1762+
}
17051763
}
17061764
}
17071765
if (!CallsWithMetadata.empty())
@@ -1756,8 +1814,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
17561814
// correlate properly in applyImport in the backends.
17571815
if (AN.MIBs.empty())
17581816
continue;
1759-
CallsWithMetadata.push_back({&AN});
1760-
auto *AllocNode = addAllocNode({&AN}, FS);
1817+
IndexCall AllocCall(&AN);
1818+
CallsWithMetadata.push_back(AllocCall);
1819+
CallToFunc[AllocCall] = FS;
1820+
auto *AllocNode = addAllocNode(AllocCall, FS);
17611821
// Pass an empty CallStack to the CallsiteContext (second)
17621822
// parameter, since for ThinLTO we already collapsed out the inlined
17631823
// stack ids on the allocation call during ModuleSummaryAnalysis.
@@ -1788,8 +1848,11 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
17881848
}
17891849
// For callsite metadata, add to list for this function for later use.
17901850
if (!FS->callsites().empty())
1791-
for (auto &SN : FS->mutableCallsites())
1792-
CallsWithMetadata.push_back({&SN});
1851+
for (auto &SN : FS->mutableCallsites()) {
1852+
IndexCall StackNodeCall(&SN);
1853+
CallsWithMetadata.push_back(StackNodeCall);
1854+
CallToFunc[StackNodeCall] = FS;
1855+
}
17931856

17941857
if (!CallsWithMetadata.empty())
17951858
FuncToCallsWithMetadata[FS] = CallsWithMetadata;
@@ -2225,6 +2288,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
22252288
if (Recursive)
22262289
OS << " (recursive)";
22272290
OS << "\n";
2291+
if (!MatchingCalls.empty()) {
2292+
OS << "\tMatchingCalls:\n";
2293+
for (auto &MatchingCall : MatchingCalls) {
2294+
OS << "\t";
2295+
MatchingCall.print(OS);
2296+
OS << "\n";
2297+
}
2298+
}
22282299
OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
22292300
OS << "\tContextIds:";
22302301
// Make a copy of the computed context ids that we can sort for stability.
@@ -2478,6 +2549,7 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
24782549
std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
24792550
ContextNode *Clone = NodeOwner.back().get();
24802551
Node->addClone(Clone);
2552+
Clone->MatchingCalls = Node->MatchingCalls;
24812553
assert(NodeToCallingFunc.count(Node));
24822554
NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
24832555
moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
@@ -3021,6 +3093,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
30213093
if (CallMap.count(Call))
30223094
CallClone = CallMap[Call];
30233095
CallsiteClone->setCall(CallClone);
3096+
// Need to do the same for all matching calls.
3097+
for (auto &MatchingCall : Node->MatchingCalls) {
3098+
CallInfo CallClone(MatchingCall);
3099+
if (CallMap.count(MatchingCall))
3100+
CallClone = CallMap[MatchingCall];
3101+
// Updates the call in the list.
3102+
MatchingCall = CallClone;
3103+
}
30243104
};
30253105

30263106
// Keep track of the clones of callsite Node that need to be assigned to
@@ -3187,6 +3267,16 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
31873267
CallInfo NewCall(CallMap[OrigCall]);
31883268
assert(NewCall);
31893269
NewClone->setCall(NewCall);
3270+
// Need to do the same for all matching calls.
3271+
for (auto &MatchingCall : NewClone->MatchingCalls) {
3272+
CallInfo OrigMatchingCall(MatchingCall);
3273+
OrigMatchingCall.setCloneNo(0);
3274+
assert(CallMap.count(OrigMatchingCall));
3275+
CallInfo NewCall(CallMap[OrigMatchingCall]);
3276+
assert(NewCall);
3277+
// Updates the call in the list.
3278+
MatchingCall = NewCall;
3279+
}
31903280
}
31913281
}
31923282
// Fall through to handling below to perform the recording of the
@@ -3373,6 +3463,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
33733463

33743464
if (Node->IsAllocation) {
33753465
updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3466+
assert(Node->MatchingCalls.empty());
33763467
return;
33773468
}
33783469

@@ -3381,6 +3472,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
33813472

33823473
auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
33833474
updateCall(Node->Call, CalleeFunc);
3475+
// Update all the matching calls as well.
3476+
for (auto &Call : Node->MatchingCalls)
3477+
updateCall(Call, CalleeFunc);
33843478
};
33853479

33863480
// Performs DFS traversal starting from allocation nodes to update calls to

0 commit comments

Comments (0)