Skip to content

Commit 63bb007

Browse files
authored
[ctxprof] Auto root detection: trie for stack samples (#133106)
An initial patch for supporting automated root detection. The auto-detector is introduced subsequently, but this patch introduces a data structure for capturing sampled stacks, per thread, in a trie, and inferring from such samples which functions are reasonable roots.
1 parent 854d795 commit 63bb007

File tree

5 files changed

+307
-1
lines changed

5 files changed

+307
-1
lines changed

compiler-rt/lib/ctx_profile/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@ add_compiler_rt_component(ctx_profile)
22

33
set(CTX_PROFILE_SOURCES
44
CtxInstrProfiling.cpp
5+
RootAutoDetector.cpp
56
)
67

78
set(CTX_PROFILE_HEADERS
89
CtxInstrContextNode.h
910
CtxInstrProfiling.h
11+
RootAutoDetector.h
1012
)
1113

1214
include_directories(..)
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
//===- RootAutoDetector.cpp - detect contextual profiling roots -----------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "RootAutoDetector.h"
10+
11+
#include "sanitizer_common/sanitizer_common.h"
12+
#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
13+
#include <assert.h>
14+
#include <dlfcn.h>
15+
#include <pthread.h>
16+
17+
using namespace __ctx_profile;
18+
template <typename T> using Set = DenseMap<T, bool>;
19+
20+
/// Map a callsite address to the start address of the function containing it,
/// using dladdr. Yields 0 when dladdr reports that it could not match the
/// address.
uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
  // The dladdr-based lookup requires --linkopt=-Wl,--export-dynamic.
  Dl_info SymInfo;
  const void *Addr = reinterpret_cast<const void *>(CallsiteAddress);
  const bool Resolved = dladdr(Addr, &SymInfo) != 0;
  return Resolved ? reinterpret_cast<uptr>(SymInfo.dli_saddr) : 0;
}
27+
28+
void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {
29+
++TheTrie.Count;
30+
auto *Current = &TheTrie;
31+
// the stack is backwards - the first callsite is at the top.
32+
for (int I = ST.size - 1; I >= 0; --I) {
33+
uptr ChildAddr = ST.trace[I];
34+
auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});
35+
++Iter->second.Count;
36+
Current = &Iter->second;
37+
}
38+
}
39+
40+
DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {
  // The underlying assumption is a message pump design: the pump is, for all
  // practical purposes, an infinite loop that fetches data from a queue, and
  // the roots are the functions it calls. Those functions must return, or the
  // pump would not work. Roots are therefore detected as the first place in the
  // trie (walking down from its root) where a function calls 2 or more
  // functions.
  //
  // The trie is keyed by callsite, so distinct child nodes may belong to the
  // same function.
  //
  // For example, writing function(callsite):
  //   f1(csf1_1) -> f2(csf2_1) -> f3
  //              -> f2(csf2_2) -> f4
  //
  // is stored in the trie as:
  //   csf1_1 -> csf2_1 -> f3
  //          -> csf2_2 -> f4
  //
  // Here we can tell that control returns to f2, but not whether it ever
  // returns to f1 - f2 could be the message pump.
  //
  // Rather than converting the callsite tree into a function tree, it is
  // cheaper to count the distinct functions present at each depth. The first
  // depth with more than one distinct function holds the potential roots;
  // everything above it is treated as a non-root.
  Set<const Trie *> Level;
  Level.insert({&TheTrie, {}});

  DenseMap<uptr, uint64_t> Roots;
  while (!Level.empty()) {
    Set<const Trie *> NextLevel;
    // Function start address -> summed sample count, for the current depth.
    DenseMap<uptr, uint64_t> SamplesPerFct;
    Level.forEach([&](const auto &Entry) {
      const Trie *Node = Entry.first;
      auto FctStart = getFctStartAddr(Node->CallsiteAddress);
      SamplesPerFct[FctStart] += Node->Count;
      // Queue every child for the next depth.
      Node->Children.forEach([&](auto &ChildEntry) {
        NextLevel.insert({&ChildEntry.second, true});
        return true;
      });
      return true;
    });
    if (SamplesPerFct.size() > 1) {
      // First depth with several distinct functions: these are the roots.
      Roots.swap(SamplesPerFct);
      break;
    }
    Level.swap(NextLevel);
  }
  // Empty when no depth ever had more than one distinct function.
  return Roots;
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*===- RootAutoDetector.h - auto-detect roots for ctxprof ----------------===*\
2+
|*
3+
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
|* See https://llvm.org/LICENSE.txt for license information.
5+
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
|*
7+
\*===----------------------------------------------------------------------===*/
8+
9+
#ifndef CTX_PROFILE_ROOTAUTODETECTOR_H_
10+
#define CTX_PROFILE_ROOTAUTODETECTOR_H_
11+
12+
#include "sanitizer_common/sanitizer_dense_map.h"
13+
#include "sanitizer_common/sanitizer_internal_defs.h"
14+
#include "sanitizer_common/sanitizer_stacktrace.h"
15+
#include <pthread.h>
16+
#include <sanitizer/common_interface_defs.h>
17+
18+
using namespace __asan;
19+
using namespace __sanitizer;
20+
21+
namespace __ctx_profile {
22+
23+
/// Capture all the stack traces observed for a specific thread. The "for a
24+
/// specific thread" part is not enforced, but assumed in determineRoots.
25+
class PerThreadCallsiteTrie {
26+
protected:
27+
/// A trie. A node is the address of a callsite in a function activation. A
28+
/// child is a callsite in the activation made from the callsite
29+
/// corresponding to the parent.
30+
struct Trie final {
31+
const uptr CallsiteAddress;
32+
uint64_t Count = 0;
33+
DenseMap<uptr, Trie> Children;
34+
35+
Trie(uptr CallsiteAddress = 0) : CallsiteAddress(CallsiteAddress) {}
36+
};
37+
Trie TheTrie;
38+
39+
/// Return the runtime start address of the function that contains the call at
40+
/// the runtime address CallsiteAddress. May be overriden for easy testing.
41+
virtual uptr getFctStartAddr(uptr CallsiteAddress) const;
42+
43+
public:
44+
PerThreadCallsiteTrie(const PerThreadCallsiteTrie &) = delete;
45+
PerThreadCallsiteTrie(PerThreadCallsiteTrie &&) = default;
46+
PerThreadCallsiteTrie() = default;
47+
48+
virtual ~PerThreadCallsiteTrie() = default;
49+
50+
void insertStack(const StackTrace &ST);
51+
52+
/// Return the runtime address of root functions, as determined for this
53+
/// thread, together with the number of samples that included them.
54+
DenseMap<uptr, uint64_t> determineRoots() const;
55+
};
56+
} // namespace __ctx_profile
57+
#endif

compiler-rt/lib/ctx_profile/tests/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@ append_list_if(COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG -Wno-variadic-macros CTX_PR
2222
file(GLOB CTX_PROFILE_HEADERS ../*.h)
2323

2424
set(CTX_PROFILE_SOURCES
25-
../CtxInstrProfiling.cpp)
25+
../CtxInstrProfiling.cpp
26+
../RootAutoDetector.cpp)
2627

2728
set(CTX_PROFILE_UNITTESTS
2829
CtxInstrProfilingTest.cpp
30+
RootAutoDetectorTest.cpp
2931
driver.cpp)
3032

3133
include_directories(../../../include)
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#include "../RootAutoDetector.h"
2+
#include "sanitizer_common/sanitizer_array_ref.h"
3+
#include "gmock/gmock.h"
4+
#include "gtest/gtest.h"
5+
6+
using namespace __ctx_profile;
7+
using ::testing::IsEmpty;
8+
using ::testing::Not;
9+
using ::testing::SizeIs;
10+
11+
// Utility for describing a preorder traversal. By default it captures the
12+
// address and count at a callsite node. Implicitly nodes are expected to have 1
13+
// child. If they have none, we place a Marker::term and if they have more than
14+
// one, we place a Marker::split(nr_of_children) For example, using a list
15+
// notation, and letters to denote a pair of address and count:
16+
// (A (B C) (D (E F))) is a list of markers: A, split(2), B, term, C,
17+
// term, D, split(2), E, term, F, term
18+
class Marker {
19+
enum class Kind { End, Value, Split };
20+
const uptr Value;
21+
const uptr Count;
22+
const Kind K;
23+
Marker(uptr V, uptr C, Kind S) : Value(V), Count(C), K(S) {}
24+
25+
public:
26+
Marker(uptr V, uptr C) : Marker(V, C, Kind::Value) {}
27+
28+
static Marker split(uptr V) { return Marker(V, 0, Kind::Split); }
29+
static Marker term() { return Marker(0, 0, Kind::End); }
30+
31+
bool isSplit() const { return K == Kind::Split; }
32+
bool isTerm() const { return K == Kind::End; }
33+
bool isVal() const { return K == Kind::Value; }
34+
35+
bool operator==(const Marker &M) const {
36+
return Value == M.Value && Count == M.Count && K == M.K;
37+
}
38+
};
39+
40+
class MockCallsiteTrie final : public PerThreadCallsiteTrie {
41+
// Return the first multiple of 100.
42+
uptr getFctStartAddr(uptr CallsiteAddress) const override {
43+
return (CallsiteAddress / 100) * 100;
44+
}
45+
46+
static void popAndCheck(ArrayRef<Marker> &Preorder, Marker M) {
47+
ASSERT_THAT(Preorder, Not(IsEmpty()));
48+
ASSERT_EQ(Preorder[0], M);
49+
Preorder = Preorder.drop_front();
50+
}
51+
52+
static void checkSameImpl(const Trie &T, ArrayRef<Marker> &Preorder) {
53+
popAndCheck(Preorder, {T.CallsiteAddress, T.Count});
54+
55+
if (T.Children.empty()) {
56+
popAndCheck(Preorder, Marker::term());
57+
return;
58+
}
59+
60+
if (T.Children.size() > 1)
61+
popAndCheck(Preorder, Marker::split(T.Children.size()));
62+
63+
T.Children.forEach([&](const auto &KVP) {
64+
checkSameImpl(KVP.second, Preorder);
65+
return true;
66+
});
67+
}
68+
69+
public:
70+
void checkSame(ArrayRef<Marker> Preorder) const {
71+
checkSameImpl(TheTrie, Preorder);
72+
ASSERT_THAT(Preorder, IsEmpty());
73+
}
74+
};
75+
76+
TEST(PerThreadCallsiteTrieTest, Insert) {
  MockCallsiteTrie Samples;

  // A first stack produces a single chain below the root node (address 0).
  uptr FirstStack[]{4, 3, 2, 1};
  Samples.insertStack({FirstStack, 4});
  Samples.checkSame(ArrayRef<Marker>(
      {{0, 1}, {1, 1}, {2, 1}, {3, 1}, {4, 1}, Marker::term()}));

  // A deeper stack sharing the same prefix extends the chain by one node and
  // bumps the counts along the shared part.
  uptr DeeperStack[]{5, 4, 3, 2, 1};
  Samples.insertStack({DeeperStack, 5});
  Samples.checkSame(ArrayRef<Marker>(
      {{0, 2}, {1, 2}, {2, 2}, {3, 2}, {4, 2}, {5, 1}, Marker::term()}));

  // A stack diverging below callsite 3 gives that node a second child.
  uptr ForkAtThree[]{6, 3, 2, 1};
  Samples.insertStack({ForkAtThree, 4});
  Samples.checkSame(ArrayRef<Marker>({{0, 3},
                                      {1, 3},
                                      {2, 3},
                                      {3, 3},
                                      Marker::split(2),
                                      {4, 2},
                                      {5, 1},
                                      Marker::term(),
                                      {6, 1},
                                      Marker::term()}));

  // A stack diverging below callsite 2 gives that node a second child too.
  uptr ForkAtTwo[]{7, 2, 1};
  Samples.insertStack({ForkAtTwo, 3});
  Samples.checkSame(ArrayRef<Marker>({{0, 4},
                                      {1, 4},
                                      {2, 4},
                                      Marker::split(2),
                                      {7, 1},
                                      Marker::term(),
                                      {3, 3},
                                      Marker::split(2),
                                      {4, 2},
                                      {5, 1},
                                      Marker::term(),
                                      {6, 1},
                                      Marker::term()}));
}
116+
117+
TEST(PerThreadCallsiteTrieTest, DetectRoots) {
  MockCallsiteTrie Samples;

  // Both stacks go through function 100 and then function 200 (callsites 202
  // and 203 resolve to the same function in the mock), and only then diverge
  // into functions 300 and 400.
  uptr ViaThreeHundred[]{501, 302, 202, 102};
  uptr ViaFourHundred[]{601, 402, 203, 102};
  Samples.insertStack(StackTrace(ViaThreeHundred, 4));
  Samples.insertStack(StackTrace(ViaFourHundred, 4));

  auto Roots = Samples.determineRoots();
  EXPECT_THAT(Roots, SizeIs(2U));
  EXPECT_TRUE(Roots.contains(300));
  EXPECT_TRUE(Roots.contains(400));
}
130+
131+
TEST(PerThreadCallsiteTrieTest, DetectRootsNoBranches) {
  MockCallsiteTrie Samples;

  // With a single stack, no depth ever holds more than one function, so no
  // roots are reported.
  uptr OnlyStack[]{501, 302, 202, 102};
  Samples.insertStack(StackTrace(OnlyStack, 4));

  auto Roots = Samples.determineRoots();
  EXPECT_THAT(Roots, IsEmpty());
}
140+
141+
TEST(PerThreadCallsiteTrieTest, DetectRootsUnknownFct) {
  MockCallsiteTrie Samples;

  uptr KnownStack[]{501, 302, 202, 102};
  // The MockCallsiteTrie address resolver rounds addresses down to a multiple
  // of 100, so callsite 40 is mapped to function address 0.
  uptr UnknownStack[]{601, 40, 203, 102};
  Samples.insertStack(StackTrace(KnownStack, 4));
  Samples.insertStack(StackTrace(UnknownStack, 4));

  auto Roots = Samples.determineRoots();
  ASSERT_THAT(Roots, SizeIs(2U));
  EXPECT_TRUE(Roots.contains(300));
  EXPECT_TRUE(Roots.contains(0));
}

0 commit comments

Comments
 (0)