Commit 764614e

[MemProf] Restructure the pruning of unneeded NotCold contexts (#138792)
This change is mostly NFC, other than the addition of a new message printed when contexts are pruned when -memprof-report-hinted-sizes is enabled. To prepare for a follow on change, adjust the way we determine which NotCold contexts can be pruned (because they overlap with longer NotCold contexts), and change the way we perform this pruning. Instead of determining the points at which we need to keep NotCold contexts during the building of the trie, we now determine this on the fly as the MIB metadata nodes are recursively built. This simplifies a follow on change that performs additional pruning of some NotCold contexts, and which can affect which others need to be kept as the longest overlapping NotCold contexts.
1 parent c1f0e68 commit 764614e
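
To make the new pruning rule easier to follow, here is a minimal standalone C++ sketch. The names MIBContext and filterNewMIBs are hypothetical; this is not the LLVM implementation, which operates on MDNode metadata and also handles -memprof-keep-all-not-cold-contexts and the hinted-size report. It applies the same keep/prune decision that the new SaveFilteredNewMIBNodes helper makes at each recursion step, using the example contexts from the comments in the diff below.

// Standalone sketch of the pruning rule (hypothetical helper names; not the
// LLVM code). Contexts are frame-ID vectors ordered from the allocation
// outward, e.g. {1, 2, 5} for the context "1 2 5" in the diff's comments.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

enum class AllocType { NotCold, Cold };

struct MIBContext {
  std::vector<uint64_t> Stack;
  AllocType Type;
};

// Keep every Cold context. Keep NotCold contexts already kept for a deeper
// caller (stack longer than CallerContextLength); otherwise keep only the
// first NotCold context created for the immediate caller.
std::vector<MIBContext> filterNewMIBs(const std::vector<MIBContext> &NewMIBs,
                                      size_t CallerContextLength) {
  bool LongerNotColdKept = false;
  for (const auto &MIB : NewMIBs)
    if (MIB.Type == AllocType::NotCold &&
        MIB.Stack.size() > CallerContextLength)
      LongerNotColdKept = true;

  std::vector<MIBContext> Saved;
  bool KeepFirstNewNotCold = !LongerNotColdKept;
  for (const auto &MIB : NewMIBs) {
    if (MIB.Type == AllocType::Cold ||
        MIB.Stack.size() > CallerContextLength) {
      Saved.push_back(MIB);
    } else if (KeepFirstNewNotCold) {
      KeepFirstNewNotCold = false;
      Saved.push_back(MIB);
    }
    // Any other NotCold context is pruned; this is the point where the new
    // "Total size for pruned non-cold ..." message would be emitted.
  }
  return Saved;
}

static void dump(const std::vector<MIBContext> &MIBs) {
  for (const auto &MIB : MIBs) {
    for (uint64_t Id : MIB.Stack)
      std::cout << Id << ' ';
    std::cout << (MIB.Type == AllocType::Cold ? "(cold)\n" : "(notcold)\n");
  }
}

int main() {
  // Example contexts from the diff's comments:
  //   1 3 (notcold), 1 2 4 (cold), 1 2 5 (notcold), 1 2 6 (notcold)
  // Step 1: at trie node 2, filter the MIBs built for its callers 4, 5, 6
  // (their contexts have length 3, so CallerContextLength is 3).
  std::vector<MIBContext> AtNode2 = {{{1, 2, 4}, AllocType::Cold},
                                     {{1, 2, 5}, AllocType::NotCold},
                                     {{1, 2, 6}, AllocType::NotCold}};
  auto Kept = filterNewMIBs(AtNode2, 3); // prunes 1 2 6

  // Step 2: at trie node 1, the survivors join the MIB built for caller 3
  // (CallerContextLength is 2). 1 3 is pruned because the longer NotCold
  // context 1 2 5 was already kept.
  Kept.push_back({{1, 3}, AllocType::NotCold});
  dump(filterNewMIBs(Kept, 2)); // prints: 1 2 4 (cold), 1 2 5 (notcold)
}

Built as a plain C++ program, this prints only "1 2 4 (cold)" and "1 2 5 (notcold)", matching the outcome described in the diff's comments: the shorter NotCold context 1 3 and the redundant 1 2 6 are pruned.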

File tree

3 files changed, +123 -79 lines changed


llvm/include/llvm/Analysis/MemoryProfileInfo.h

Lines changed: 1 addition & 33 deletions
@@ -56,37 +56,6 @@ class CallStackTrie {
     // Allocation types for call context sharing the context prefix at this
     // node.
     uint8_t AllocTypes;
-    // Updated as we add allocations to note if this is the deepest point in the
-    // trie that has an ambiguous allocation type (both Cold and NotCold). It is
-    // used to prune unneeded NotCold contexts, taking advantage of the fact
-    // that we later will only clone Cold contexts, as NotCold is the allocation
-    // default. We only need to keep as metadata the NotCold contexts that
-    // overlap the longest with Cold allocations, so that we know how deeply we
-    // need to clone. For example, assume we add the following contexts to the
-    // trie:
-    //      1 3 (notcold)
-    //      1 2 4 (cold)
-    //      1 2 5 (notcold)
-    //      1 2 6 (notcold)
-    // the trie looks like:
-    //         1
-    //        / \
-    //       2   3
-    //      /|\
-    //     4 5 6
-    //
-    // It is sufficient to prune all but one not cold contexts (either 1,2,5 or
-    // 1,2,6, we arbitrarily keep the first one we encounter which will be
-    // 1,2,5). We'll initially have DeepestAmbiguousAllocType set false for trie
-    // node 1 after the trie is built, and true for node 2. This indicates that
-    // the not cold context ending in 3 is not needed (its immediate callee has
-    // this value set false). The first notcold context we encounter when
-    // iterating the callers of node 2 will be the context ending in 5 (since
-    // std::map iteration is in sorted order of key), which will see that this
-    // field is true for its callee, so we will keep it. But at that point we
-    // set the callee's flag to false which prevents adding the not cold context
-    // ending in 6 unnecessarily.
-    bool DeepestAmbiguousAllocType = true;
     // If the user has requested reporting of hinted sizes, keep track of the
     // associated full stack id and profiled sizes. Can have more than one
     // after trimming (e.g. when building from metadata). This is only placed on
@@ -134,8 +103,7 @@ class CallStackTrie {
   bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
                      std::vector<uint64_t> &MIBCallStack,
                      std::vector<Metadata *> &MIBNodes,
-                     bool CalleeHasAmbiguousCallerContext,
-                     bool &CalleeDeepestAmbiguousAllocType);
+                     bool CalleeHasAmbiguousCallerContext);
 
 public:
   CallStackTrie() = default;

llvm/lib/Analysis/MemoryProfileInfo.cpp

Lines changed: 110 additions & 41 deletions
@@ -163,16 +163,10 @@ void CallStackTrie::addCallStack(
       continue;
     }
     // Update existing caller node if it exists.
-    CallStackTrieNode *Prev = nullptr;
     auto [Next, Inserted] = Curr->Callers.try_emplace(StackId);
     if (!Inserted) {
-      Prev = Curr;
       Curr = Next->second;
       Curr->addAllocType(AllocType);
-      // If this node has an ambiguous alloc type, its callee is not the deepest
-      // point where we have an ambigous allocation type.
-      if (!hasSingleAllocType(Curr->AllocTypes))
-        Prev->DeepestAmbiguousAllocType = false;
       continue;
     }
     // Otherwise add a new caller node.
@@ -248,41 +242,114 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) {
     convertHotToNotCold(Caller.second);
 }
 
+// Copy over some or all of NewMIBNodes to the SavedMIBNodes vector, depending
+// on options that enable filtering out some NotCold contexts.
+static void SaveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
+                                    std::vector<Metadata *> &SavedMIBNodes,
+                                    unsigned CallerContextLength) {
+  // In the simplest case, with pruning disabled, keep all the new MIB nodes.
+  if (MemProfKeepAllNotColdContexts)
+    append_range(SavedMIBNodes, NewMIBNodes);
+
+  auto EmitMessageForRemovedContexts = [](const MDNode *MIBMD, StringRef Tag,
+                                          StringRef Extra) {
+    assert(MIBMD->getNumOperands() > 2);
+    for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
+      MDNode *ContextSizePair = dyn_cast<MDNode>(MIBMD->getOperand(I));
+      assert(ContextSizePair->getNumOperands() == 2);
+      uint64_t FullStackId =
+          mdconst::dyn_extract<ConstantInt>(ContextSizePair->getOperand(0))
+              ->getZExtValue();
+      uint64_t TS =
+          mdconst::dyn_extract<ConstantInt>(ContextSizePair->getOperand(1))
+              ->getZExtValue();
+      errs() << "MemProf hinting: Total size for " << Tag
+             << " non-cold full allocation context hash " << FullStackId
+             << Extra << ": " << TS << "\n";
+    }
+  };
+
+  // Prune unneeded NotCold contexts, taking advantage of the fact
+  // that we later will only clone Cold contexts, as NotCold is the allocation
+  // default. We only need to keep as metadata the NotCold contexts that
+  // overlap the longest with Cold allocations, so that we know how deeply we
+  // need to clone. For example, assume we add the following contexts to the
+  // trie:
+  //      1 3 (notcold)
+  //      1 2 4 (cold)
+  //      1 2 5 (notcold)
+  //      1 2 6 (notcold)
+  // the trie looks like:
+  //         1
+  //        / \
+  //       2   3
+  //      /|\
+  //     4 5 6
+  //
+  // It is sufficient to prune all but one not-cold contexts (either 1,2,5 or
+  // 1,2,6, we arbitrarily keep the first one we encounter which will be
+  // 1,2,5).
+  //
+  // To do this pruning, we first check if there were any not-cold
+  // contexts kept for a deeper caller, which will have a context length larger
+  // than the CallerContextLength being handled here (i.e. kept by a deeper
+  // recursion step). If so, none of the not-cold MIB nodes added for the
+  // immediate callers need to be kept. If not, we keep the first (created
+  // for the immediate caller) not-cold MIB node.
+  bool LongerNotColdContextKept = false;
+  for (auto *MIB : NewMIBNodes) {
+    auto MIBMD = cast<MDNode>(MIB);
+    if (getMIBAllocType(MIBMD) == AllocationType::Cold)
+      continue;
+    MDNode *StackMD = getMIBStackNode(MIBMD);
+    assert(StackMD);
+    if (StackMD->getNumOperands() > CallerContextLength) {
+      LongerNotColdContextKept = true;
+      break;
+    }
+  }
+  // Don't need to emit any for the immediate caller if we already have
+  // longer overlapping contexts;
+  bool KeepFirstNewNotCold = !LongerNotColdContextKept;
+  auto NewColdMIBNodes = make_filter_range(NewMIBNodes, [&](const Metadata *M) {
+    auto MIBMD = cast<MDNode>(M);
+    // Only keep cold contexts and first (longest non-cold context).
+    if (getMIBAllocType(MIBMD) != AllocationType::Cold) {
+      MDNode *StackMD = getMIBStackNode(MIBMD);
+      assert(StackMD);
+      // Keep any already kept for longer contexts.
+      if (StackMD->getNumOperands() > CallerContextLength)
+        return true;
+      // Otherwise keep the first one added by the immediate caller if there
+      // were no longer contexts.
+      if (KeepFirstNewNotCold) {
+        KeepFirstNewNotCold = false;
+        return true;
+      }
+      if (MemProfReportHintedSizes)
+        EmitMessageForRemovedContexts(MIBMD, "pruned", "");
+      return false;
+    }
+    return true;
+  });
+  for (auto *M : NewColdMIBNodes)
+    SavedMIBNodes.push_back(M);
+}
+
 // Recursive helper to trim contexts and create metadata nodes.
 // Caller should have pushed Node's loc to MIBCallStack. Doing this in the
 // caller makes it simpler to handle the many early returns in this method.
 bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
                                   std::vector<uint64_t> &MIBCallStack,
                                   std::vector<Metadata *> &MIBNodes,
-                                  bool CalleeHasAmbiguousCallerContext,
-                                  bool &CalleeDeepestAmbiguousAllocType) {
+                                  bool CalleeHasAmbiguousCallerContext) {
   // Trim context below the first node in a prefix with a single alloc type.
   // Add an MIB record for the current call stack prefix.
   if (hasSingleAllocType(Node->AllocTypes)) {
-    // Because we only clone cold contexts (we don't clone for exposing NotCold
-    // contexts as that is the default allocation behavior), we create MIB
-    // metadata for this context if any of the following are true:
-    // 1) It is cold.
-    // 2) The immediate callee is the deepest point where we have an ambiguous
-    //    allocation type (i.e. the other callers that are cold need to know
-    //    that we have a not cold context overlapping to this point so that we
-    //    know how deep to clone).
-    // 3) MemProfKeepAllNotColdContexts is enabled, which is useful if we are
-    //    reporting hinted sizes, and want to get information from the indexing
-    //    step for all contexts, or have specified a value less than 100% for
-    //    -memprof-cloning-cold-threshold.
-    if (Node->hasAllocType(AllocationType::Cold) ||
-        CalleeDeepestAmbiguousAllocType || MemProfKeepAllNotColdContexts) {
-      std::vector<ContextTotalSize> ContextSizeInfo;
-      collectContextSizeInfo(Node, ContextSizeInfo);
-      MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack,
-                                       (AllocationType)Node->AllocTypes,
-                                       ContextSizeInfo));
-      // If we just emitted an MIB for a not cold caller, don't need to emit
-      // another one for the callee to correctly disambiguate its cold callers.
-      if (!Node->hasAllocType(AllocationType::Cold))
-        CalleeDeepestAmbiguousAllocType = false;
-    }
+    std::vector<ContextTotalSize> ContextSizeInfo;
+    collectContextSizeInfo(Node, ContextSizeInfo);
+    MIBNodes.push_back(createMIBNode(
+        Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo));
     return true;
   }
 
@@ -291,14 +358,21 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
   if (!Node->Callers.empty()) {
     bool NodeHasAmbiguousCallerContext = Node->Callers.size() > 1;
     bool AddedMIBNodesForAllCallerContexts = true;
+    // Accumulate all new MIB nodes by the recursive calls below into a vector
+    // that will later be filtered before adding to the caller's MIBNodes
+    // vector.
+    std::vector<Metadata *> NewMIBNodes;
     for (auto &Caller : Node->Callers) {
       MIBCallStack.push_back(Caller.first);
-      AddedMIBNodesForAllCallerContexts &= buildMIBNodes(
-          Caller.second, Ctx, MIBCallStack, MIBNodes,
-          NodeHasAmbiguousCallerContext, Node->DeepestAmbiguousAllocType);
+      AddedMIBNodesForAllCallerContexts &=
+          buildMIBNodes(Caller.second, Ctx, MIBCallStack, NewMIBNodes,
+                        NodeHasAmbiguousCallerContext);
       // Remove Caller.
       MIBCallStack.pop_back();
     }
+    // Pass in the stack length of the MIB nodes added for the immediate caller,
+    // which is the current stack length plus 1.
+    SaveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1);
     if (AddedMIBNodesForAllCallerContexts)
       return true;
     // We expect that the callers should be forced to add MIBs to disambiguate
@@ -372,13 +446,8 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
   // The CalleeHasAmbiguousCallerContext flag is meant to say whether the
   // callee of the given node has more than one caller. Here the node being
   // passed in is the alloc and it has no callees. So it's false.
-  // Similarly, the last parameter is meant to say whether the callee of the
-  // given node is the deepest point where we have ambiguous alloc types, which
-  // is also false as the alloc has no callees.
-  bool DeepestAmbiguousAllocType = true;
   if (buildMIBNodes(Alloc, Ctx, MIBCallStack, MIBNodes,
-                    /*CalleeHasAmbiguousCallerContext=*/false,
-                    DeepestAmbiguousAllocType)) {
+                    /*CalleeHasAmbiguousCallerContext=*/false)) {
     assert(MIBCallStack.size() == 1 &&
            "Should only be left with Alloc's location in stack");
     CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));

llvm/test/Transforms/PGOProfile/memprof.ll

Lines changed: 12 additions & 5 deletions
@@ -63,15 +63,20 @@
 ;; give both memprof and pgo metadata.
 ; RUN: opt < %s -passes='pgo-instr-use,memprof-use<profile-filename=%t.pgomemprofdata>' -pgo-test-profile-file=%t.pgomemprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,PGO
 
-;; Check that the total sizes are reported if requested.
-; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZES
+;; Check that the total sizes are reported if requested. A message should be
+;; emitted for the pruned context.
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZES,TOTALSIZENOKEEPALL
+
+;; Check that the total sizes are reported if requested, and prevent pruning
+;; via -memprof-keep-all-not-cold-contexts.
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZES,TOTALSIZESKEEPALL
 
 ;; Check that we hint additional allocations with a threshold < 100%
 ; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-matching-cold-threshold=60 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZESTHRESH60
 
 ;; Make sure that the -memprof-cloning-cold-threshold flag is enough to cause
 ;; the size metadata to be generated for the LTO link.
-; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-cloning-cold-threshold=80 -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=TOTALSIZES
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-cloning-cold-threshold=80 -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=TOTALSIZES,TOTALSIZESKEEPALL
 
 ;; Make sure we emit a random hotness seed if requested.
 ; RUN: llvm-profdata merge -memprof-random-hotness %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand 2>&1 | FileCheck %s --check-prefix=RAND
@@ -370,6 +375,8 @@ for.end: ; preds = %for.cond
 ; MEMPROF: ![[C10]] = !{i64 2061451396820446691}
 ; MEMPROF: ![[C11]] = !{i64 1544787832369987002}
 
+; TOTALSIZENOKEEPALL: Total size for pruned non-cold full allocation context hash 1093248920606587996: 10
+
 ;; For non-context sensitive allocations that get attributes we emit a message
 ;; with the full allocation context hash, type, and size in bytes.
 ; TOTALSIZESTHRESH60: Total size for full allocation context hash 8525406123785421946 and dominant alloc type cold: 10
@@ -393,8 +400,8 @@ for.end: ; preds = %for.cond
 ; TOTALSIZES: !"cold", ![[CONTEXT4:[0-9]+]], ![[CONTEXT5:[0-9]+]]}
 ; TOTALSIZES: ![[CONTEXT4]] = !{i64 -2103941543456458045, i64 10}
 ; TOTALSIZES: ![[CONTEXT5]] = !{i64 -191931298737547222, i64 10}
-; TOTALSIZES: !"notcold", ![[CONTEXT6:[0-9]+]]}
-; TOTALSIZES: ![[CONTEXT6]] = !{i64 1093248920606587996, i64 10}
+; TOTALSIZESKEEPALL: !"notcold", ![[CONTEXT6:[0-9]+]]}
+; TOTALSIZESKEEPALL: ![[CONTEXT6]] = !{i64 1093248920606587996, i64 10}
 
 ; MEMPROFNOCOLINFO: #[[A1]] = { builtin allocsize(0) "memprof"="notcold" }
 ; MEMPROFNOCOLINFO: #[[A2]] = { builtin allocsize(0) "memprof"="cold" }
