-
Notifications
You must be signed in to change notification settings - Fork 13.6k
Reapply "[MemProf] Reduce cloning overhead by sharing nodes when possible" (#102932) with fixes #106623
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Reapply "[MemProf] Reduce cloning overhead by sharing nodes when possible" (#102932) with fixes #106623
Changes from 1 commit
94737fb
0148e96
a9f19fd
f93115f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -532,6 +532,11 @@ class CallsiteContextGraph { | |
Call, Func, CallerFunc, FoundCalleeChain); | ||
} | ||
|
||
/// Returns true if both call instructions have the same callee. | ||
bool sameCallee(CallTy Call1, CallTy Call2) { | ||
return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2); | ||
} | ||
|
||
/// Get a list of nodes corresponding to the stack ids in the given | ||
/// callsite's context. | ||
std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) { | ||
|
@@ -678,6 +683,7 @@ class ModuleCallsiteContextGraph | |
bool calleeMatchesFunc( | ||
Instruction *Call, const Function *Func, const Function *CallerFunc, | ||
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain); | ||
bool sameCallee(Instruction *Call1, Instruction *Call2); | ||
bool findProfiledCalleeThroughTailCalls( | ||
const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, | ||
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, | ||
|
@@ -755,6 +761,7 @@ class IndexCallsiteContextGraph | |
IndexCall &Call, const FunctionSummary *Func, | ||
const FunctionSummary *CallerFunc, | ||
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain); | ||
bool sameCallee(IndexCall &Call1, IndexCall &Call2); | ||
bool findProfiledCalleeThroughTailCalls( | ||
ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, | ||
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, | ||
|
@@ -1892,35 +1899,86 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, | |
// from the profiled contexts. | ||
MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap; | ||
|
||
std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode; | ||
for (auto &Entry : NonAllocationCallToContextNodeMap) { | ||
auto *Node = Entry.second; | ||
assert(Node->Clones.empty()); | ||
// Check all node callees and see if in the same function. | ||
auto Call = Node->Call.call(); | ||
for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); | ||
++EI) { | ||
auto Edge = *EI; | ||
if (!Edge->Callee->hasCall()) | ||
continue; | ||
assert(NodeToCallingFunc.count(Edge->Callee)); | ||
// Check if the called function matches that of the callee node. | ||
if (calleesMatch(Call, EI, TailCallToContextNodeMap)) | ||
continue; | ||
// We need to check all of the calls recorded in this Node, because in some | ||
// cases we may have had multiple calls with the same debug info calling | ||
// different callees. Here we will prune any that don't match all callee | ||
// nodes. | ||
std::vector<CallInfo> AllCalls = Node->MatchingCalls; | ||
AllCalls.insert(AllCalls.begin(), Node->Call); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this will trigger an alloc and a copy just to add one element. Maybe rewrite to reserve, insert There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
auto It = AllCalls.begin(); | ||
// Iterate through the calls until we find the first that matches. | ||
for (; It != AllCalls.end(); ++It) { | ||
auto ThisCall = *It; | ||
bool Match = true; | ||
for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); | ||
++EI) { | ||
auto Edge = *EI; | ||
if (!Edge->Callee->hasCall()) | ||
continue; | ||
assert(NodeToCallingFunc.count(Edge->Callee)); | ||
// Check if the called function matches that of the callee node. | ||
if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) { | ||
Match = false; | ||
break; | ||
} | ||
} | ||
// Found a call that matches the callee nodes, we can quit now. | ||
if (Match) { | ||
// If the first match is not the primary call on the Node, update it | ||
// now. We will update the list of matching calls further below. | ||
if (Node->Call != ThisCall) { | ||
Node->setCall(ThisCall); | ||
// We need to update the NonAllocationCallToContextNodeMap, but don't | ||
// want to do this during iteration over that map, so save the calls | ||
// that need updated entries. | ||
NewCallToNode.push_back({ThisCall, Node}); | ||
// We should only have shared this node between calls from the same | ||
// function. | ||
assert(NodeToCallingFunc[Node] == CallToFunc[Node->Call]); | ||
} | ||
break; | ||
} | ||
} | ||
// We will update this list below (or leave it cleared if there was no | ||
// match found above). | ||
Node->MatchingCalls.clear(); | ||
// If we hit the end of the AllCalls vector, no call matching the callee | ||
// nodes was found, clear the call information in the node. | ||
if (It == AllCalls.end()) { | ||
RemovedEdgesWithMismatchedCallees++; | ||
// Work around by setting Node to have a null call, so it gets | ||
// skipped during cloning. Otherwise assignFunctions will assert | ||
// because its data structures are not designed to handle this case. | ||
Node->setCall(CallInfo()); | ||
break; | ||
continue; | ||
} | ||
// Now add back any matching calls that call the same function as the | ||
// matching primary call on Node. | ||
for (++It; It != AllCalls.end(); ++It) { | ||
auto ThisCall = *It; | ||
if (!sameCallee(Node->Call.call(), ThisCall.call())) | ||
continue; | ||
Node->MatchingCalls.push_back(ThisCall); | ||
} | ||
} | ||
|
||
// Remove all mismatched nodes identified in the above loop from the node map | ||
// (checking whether they have a null call which is set above). For a | ||
// MapVector like NonAllocationCallToContextNodeMap it is much more efficient | ||
// to do the removal via remove_if than by individually erasing entries above. | ||
NonAllocationCallToContextNodeMap.remove_if( | ||
[](const auto &it) { return !it.second->hasCall(); }); | ||
// Also remove any entries if we updated the node's primary call above. | ||
NonAllocationCallToContextNodeMap.remove_if([](const auto &it) { | ||
return !it.second->hasCall() || it.second->Call != it.first; | ||
}); | ||
|
||
// Add entries for any new primary calls recorded above. | ||
for (auto &[Call, Node] : NewCallToNode) | ||
NonAllocationCallToContextNodeMap[Call] = Node; | ||
|
||
// Add the new nodes after the above loop so that the iteration is not | ||
// invalidated. | ||
|
@@ -2146,6 +2204,21 @@ bool ModuleCallsiteContextGraph::calleeMatchesFunc( | |
return true; | ||
} | ||
|
||
bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1, | ||
Instruction *Call2) { | ||
auto *CB1 = dyn_cast<CallBase>(Call1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to check the dyn_cast returned values for nulls? If not then should they be dyn_cast in the first place? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. changed to cast here. |
||
if (!CB1->getCalledOperand() || CB1->isIndirectCall()) | ||
return false; | ||
auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts(); | ||
auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1); | ||
auto *CB2 = dyn_cast<CallBase>(Call2); | ||
if (!CB2->getCalledOperand() || CB2->isIndirectCall()) | ||
return false; | ||
auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts(); | ||
auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2); | ||
return CalleeFunc1 == CalleeFunc2; | ||
} | ||
|
||
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( | ||
ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, | ||
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, | ||
|
@@ -2272,6 +2345,14 @@ bool IndexCallsiteContextGraph::calleeMatchesFunc( | |
return true; | ||
} | ||
|
||
bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) { | ||
ValueInfo Callee1 = | ||
dyn_cast_if_present<CallsiteInfo *>(Call1.getBase())->Callee; | ||
ValueInfo Callee2 = | ||
dyn_cast_if_present<CallsiteInfo *>(Call2.getBase())->Callee; | ||
return Callee1 == Callee2; | ||
} | ||
|
||
template <typename DerivedCCG, typename FuncTy, typename CallTy> | ||
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() | ||
const { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
;; Test to ensure a call to a different callee but with the same debug info | ||
;; (and therefore callsite metadata) as a preceding call in the alloc context | ||
;; does not cause missing or incorrect cloning. This test is otherwise the same | ||
;; as memprof-basic.ll. | ||
|
||
;; -stats requires asserts | ||
; REQUIRES: asserts | ||
|
||
; RUN: opt -thinlto-bc %s >%t.o | ||
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ | ||
; RUN: -supports-hot-cold-new \ | ||
; RUN: -r=%t.o,main,plx \ | ||
; RUN: -r=%t.o,blah, \ | ||
; RUN: -r=%t.o,_Znam, \ | ||
; RUN: -memprof-verify-ccg -memprof-verify-nodes \ | ||
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ | ||
; RUN: -o %t.out 2>&1 | FileCheck %s \ | ||
; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS | ||
|
||
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR | ||
|
||
source_filename = "memprof-aliased-location1.ll" | ||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" | ||
target triple = "x86_64-unknown-linux-gnu" | ||
|
||
define i32 @main() #0 { | ||
entry: | ||
%call = call ptr @_Z3foov(), !callsite !0 | ||
%call1 = call ptr @_Z3foov(), !callsite !1 | ||
ret i32 0 | ||
} | ||
|
||
declare void @blah() | ||
|
||
define internal ptr @_Z3barv() #0 { | ||
entry: | ||
%call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 | ||
ret ptr null | ||
} | ||
|
||
declare ptr @_Znam(i64) | ||
|
||
define internal ptr @_Z3bazv() #0 { | ||
entry: | ||
;; Preceding call to another callee but with the same debug location / callsite id | ||
call void @blah(), !callsite !8 | ||
%call = call ptr @_Z3barv(), !callsite !8 | ||
ret ptr null | ||
} | ||
|
||
define internal ptr @_Z3foov() #0 { | ||
entry: | ||
%call = call ptr @_Z3bazv(), !callsite !9 | ||
ret ptr null | ||
} | ||
|
||
; uselistorder directives | ||
uselistorder ptr @_Z3foov, { 1, 0 } | ||
|
||
attributes #0 = { noinline optnone } | ||
|
||
!0 = !{i64 8632435727821051414} | ||
!1 = !{i64 -3421689549917153178} | ||
!2 = !{!3, !5} | ||
!3 = !{!4, !"notcold", i64 100} | ||
!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} | ||
!5 = !{!6, !"cold", i64 400} | ||
!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} | ||
!7 = !{i64 9086428284934609951} | ||
!8 = !{i64 -5964873800580613432} | ||
!9 = !{i64 2732490490862098848} | ||
|
||
; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 | ||
; REMARKS: created clone _Z3barv.memprof.1 | ||
; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold | ||
; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold | ||
; REMARKS: created clone _Z3bazv.memprof.1 | ||
; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 | ||
; REMARKS: created clone _Z3foov.memprof.1 | ||
; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 | ||
|
||
|
||
; IR: define {{.*}} @main | ||
;; The first call to foo does not allocate cold memory. It should call the | ||
;; original functions, which ultimately call the original allocation decorated | ||
;; with a "notcold" attribute. | ||
; IR: call {{.*}} @_Z3foov() | ||
;; The second call to foo allocates cold memory. It should call cloned functions | ||
;; which ultimately call a cloned allocation decorated with a "cold" attribute. | ||
; IR: call {{.*}} @_Z3foov.memprof.1() | ||
; IR: define internal {{.*}} @_Z3barv() | ||
; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] | ||
; IR: define internal {{.*}} @_Z3bazv() | ||
; IR: call {{.*}} @_Z3barv() | ||
; IR: define internal {{.*}} @_Z3foov() | ||
; IR: call {{.*}} @_Z3bazv() | ||
; IR: define internal {{.*}} @_Z3barv.memprof.1() | ||
; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] | ||
; IR: define internal {{.*}} @_Z3bazv.memprof.1() | ||
; IR: call {{.*}} @_Z3barv.memprof.1() | ||
; IR: define internal {{.*}} @_Z3foov.memprof.1() | ||
; IR: call {{.*}} @_Z3bazv.memprof.1() | ||
; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } | ||
; IR: attributes #[[COLD]] = { "memprof"="cold" } | ||
|
||
|
||
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) | ||
; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend | ||
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) | ||
; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend | ||
; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend | ||
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis | ||
; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend | ||
; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend | ||
; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend | ||
; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
;; Test to ensure a call to a different callee but with the same debug info | ||
;; (and therefore callsite metadata) as a subsequent call in the alloc context | ||
;; does not cause missing or incorrect cloning. This test is otherwise the same | ||
;; as memprof-basic.ll. | ||
|
||
;; -stats requires asserts | ||
; REQUIRES: asserts | ||
|
||
; RUN: opt -thinlto-bc %s >%t.o | ||
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ | ||
; RUN: -supports-hot-cold-new \ | ||
; RUN: -r=%t.o,main,plx \ | ||
; RUN: -r=%t.o,blah, \ | ||
; RUN: -r=%t.o,_Znam, \ | ||
; RUN: -memprof-verify-ccg -memprof-verify-nodes \ | ||
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ | ||
; RUN: -o %t.out 2>&1 | FileCheck %s \ | ||
; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS | ||
|
||
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR | ||
|
||
source_filename = "memprof-aliased-location2.ll" | ||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" | ||
target triple = "x86_64-unknown-linux-gnu" | ||
|
||
define i32 @main() #0 { | ||
entry: | ||
%call = call ptr @_Z3foov(), !callsite !0 | ||
%call1 = call ptr @_Z3foov(), !callsite !1 | ||
ret i32 0 | ||
} | ||
|
||
declare void @blah() | ||
|
||
define internal ptr @_Z3barv() #0 { | ||
entry: | ||
%call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 | ||
ret ptr null | ||
} | ||
|
||
declare ptr @_Znam(i64) | ||
|
||
define internal ptr @_Z3bazv() #0 { | ||
entry: | ||
%call = call ptr @_Z3barv(), !callsite !8 | ||
;; Subsequent call to another callee but with the same debug location / callsite id | ||
call void @blah(), !callsite !8 | ||
ret ptr null | ||
} | ||
|
||
define internal ptr @_Z3foov() #0 { | ||
entry: | ||
%call = call ptr @_Z3bazv(), !callsite !9 | ||
ret ptr null | ||
} | ||
|
||
; uselistorder directives | ||
uselistorder ptr @_Z3foov, { 1, 0 } | ||
|
||
attributes #0 = { noinline optnone } | ||
|
||
!0 = !{i64 8632435727821051414} | ||
!1 = !{i64 -3421689549917153178} | ||
!2 = !{!3, !5} | ||
!3 = !{!4, !"notcold", i64 100} | ||
!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} | ||
!5 = !{!6, !"cold", i64 400} | ||
!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} | ||
!7 = !{i64 9086428284934609951} | ||
!8 = !{i64 -5964873800580613432} | ||
!9 = !{i64 2732490490862098848} | ||
|
||
; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 | ||
; REMARKS: created clone _Z3barv.memprof.1 | ||
; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold | ||
; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold | ||
; REMARKS: created clone _Z3bazv.memprof.1 | ||
; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 | ||
; REMARKS: created clone _Z3foov.memprof.1 | ||
; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 | ||
|
||
|
||
; IR: define {{.*}} @main | ||
;; The first call to foo does not allocate cold memory. It should call the | ||
;; original functions, which ultimately call the original allocation decorated | ||
;; with a "notcold" attribute. | ||
; IR: call {{.*}} @_Z3foov() | ||
;; The second call to foo allocates cold memory. It should call cloned functions | ||
;; which ultimately call a cloned allocation decorated with a "cold" attribute. | ||
; IR: call {{.*}} @_Z3foov.memprof.1() | ||
; IR: define internal {{.*}} @_Z3barv() | ||
; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] | ||
; IR: define internal {{.*}} @_Z3bazv() | ||
; IR: call {{.*}} @_Z3barv() | ||
; IR: define internal {{.*}} @_Z3foov() | ||
; IR: call {{.*}} @_Z3bazv() | ||
; IR: define internal {{.*}} @_Z3barv.memprof.1() | ||
; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] | ||
; IR: define internal {{.*}} @_Z3bazv.memprof.1() | ||
; IR: call {{.*}} @_Z3barv.memprof.1() | ||
; IR: define internal {{.*}} @_Z3foov.memprof.1() | ||
; IR: call {{.*}} @_Z3bazv.memprof.1() | ||
; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } | ||
; IR: attributes #[[COLD]] = { "memprof"="cold" } | ||
|
||
|
||
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) | ||
; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend | ||
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) | ||
; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend | ||
; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend | ||
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis | ||
; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend | ||
; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend | ||
; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend | ||
; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same info including column numbers? Can you extend this comment to mention some examples.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes. I added some details for the case I saw to the comment.