Skip to content

Commit 9a9af0a

Browse files
shawbyoung and aaupov authored
[BOLT] Match blocks with pseudo probes (#99891)
Match inline trees first between profile and the binary: by GUID, checksum, parent, and inline site for inlined functions. Map profile probes to binary probes via matched inline tree nodes. Each binary probe has an associated binary basic block. If all probes from one profile basic block map to the same binary basic block, it’s an exact match; otherwise the block is determined by majority vote and reported as a loose match.

Pseudo probe matching happens between exact hash matching and call/loose matching.

Introduce ProbeMatchSpec - a mechanism to match probes belonging to another binary function. For example, given functions foo and bar:

```
void foo() { bar(); }
```

- profiled binary: bar is not inlined => have top-level function bar
- new binary where the profile is applied to: bar is inlined into foo.

Currently, BOLT does 1:1 matching between profile functions and binary functions based on the name. #100446 will extend this to N:M where multiple profiles can be matched to one binary function (as in the example above where binary function foo would use profiles for foo and bar), and one profile can be matched to multiple binary functions (e.g. if bar was inlined into multiple functions).

In this diff, ProbeMatchSpecs would only have one BinaryFunctionProfile (existing name-based matching).

Test Plan: Added match-blocks-with-pseudo-probes.test

Performance test:
- Setup:
  - Baseline no-BOLT: Clang with pseudo probes, ThinLTO + CSSPGO (#79942)
  - BOLT fresh: BOLTed Clang using fresh profile,
  - BOLT stale (hash): BOLTed Clang using stale profile (collected on Clang 10K commits back), `-infer-stale-profile` (hash+call block matching)
  - BOLT stale (+probe): BOLTed Clang using stale profile, `-infer-stale-profile` with `-stale-matching-with-pseudo-probes` (hash+call+pseudo probe block matching)
  - 2S Intel SKX Xeon 6138 with 40C/80T and 256GB RAM, using 20C/40T for build,
  - BOLT profiles are collected on Clang compiling large preprocessed C++ file.
- Benchmark: building Clang (average of 5 runs), see driver in aaupov/llvm-devmtg-2022
- Results, wall time, lower is better:
  - Baseline no-BOLT: 429.52 +- 2.61s,
  - BOLT stale (hash): 413.21 +- 2.19s,
  - BOLT stale (+probe): 409.69 +- 1.41s,
  - BOLT fresh: 384.50 +- 1.80s.

---------

Co-authored-by: Amir Ayupov <[email protected]>
1 parent 8888352 commit 9a9af0a

11 files changed

+664
-98
lines changed

bolt/include/bolt/Core/BinaryContext.h

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -723,12 +723,28 @@ class BinaryContext {
723723
/// Stats for stale profile matching:
724724
/// the total number of basic blocks in the profile
725725
uint32_t NumStaleBlocks{0};
726-
/// the number of matched basic blocks
727-
uint32_t NumMatchedBlocks{0};
726+
/// the number of exactly matched basic blocks
727+
uint32_t NumExactMatchedBlocks{0};
728+
/// the number of loosely matched basic blocks
729+
uint32_t NumLooseMatchedBlocks{0};
730+
/// the number of exactly pseudo probe matched basic blocks
731+
uint32_t NumPseudoProbeExactMatchedBlocks{0};
732+
/// the number of loosely pseudo probe matched basic blocks
733+
uint32_t NumPseudoProbeLooseMatchedBlocks{0};
734+
/// the number of call matched basic blocks
735+
uint32_t NumCallMatchedBlocks{0};
728736
/// the total count of samples in the profile
729737
uint64_t StaleSampleCount{0};
730-
/// the count of matched samples
731-
uint64_t MatchedSampleCount{0};
738+
/// the count of exactly matched samples
739+
uint64_t ExactMatchedSampleCount{0};
740+
/// the count of loosely matched samples
741+
uint64_t LooseMatchedSampleCount{0};
742+
/// the count of exactly pseudo probe matched samples
743+
uint64_t PseudoProbeExactMatchedSampleCount{0};
744+
/// the count of loosely pseudo probe matched samples
745+
uint64_t PseudoProbeLooseMatchedSampleCount{0};
746+
/// the count of call matched samples
747+
uint64_t CallMatchedSampleCount{0};
732748
/// the number of stale functions that have matching number of blocks in
733749
/// the profile
734750
uint64_t NumStaleFuncsWithEqualBlockCount{0};

bolt/include/bolt/Profile/ProfileYAMLMapping.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -174,6 +174,9 @@ struct InlineTreeNode {
174174
uint32_t CallSiteProbe;
175175
// Index in PseudoProbeDesc.GUID, UINT32_MAX for same as previous (omitted)
176176
uint32_t GUIDIndex;
177+
// Decoded contents, ParentIndexDelta becomes absolute value.
178+
uint64_t GUID;
179+
uint64_t Hash;
177180
bool operator==(const InlineTreeNode &) const { return false; }
178181
};
179182
} // end namespace bolt

bolt/include/bolt/Profile/YAMLProfileReader.h

Lines changed: 73 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@
1414
#include <unordered_set>
1515

1616
namespace llvm {
17+
class MCDecodedPseudoProbeInlineTree;
18+
1719
namespace bolt {
1820

1921
class YAMLProfileReader : public ProfileReaderBase {
@@ -43,6 +45,9 @@ class YAMLProfileReader : public ProfileReaderBase {
4345
using ProfileLookupMap =
4446
DenseMap<uint32_t, yaml::bolt::BinaryFunctionProfile *>;
4547

48+
using GUIDInlineTreeMap =
49+
std::unordered_map<uint64_t, const MCDecodedPseudoProbeInlineTree *>;
50+
4651
/// A class for matching binary functions to functions in the YAML profile.
4752
/// First, a call graph is constructed for both profiled and binary functions.
4853
/// Then functions are hashed based on the names of their callee/caller
@@ -96,6 +101,61 @@ class YAMLProfileReader : public ProfileReaderBase {
96101
YamlBFAdjacencyMap;
97102
};
98103

104+
// A class for matching inline tree nodes between profile and binary.
105+
// Provides the mapping from profile inline tree node id to a
106+
// corresponding binary MCDecodedPseudoProbeInlineTree node.
107+
//
108+
// The whole mapping process is the following:
109+
//
110+
// (profile) (binary)
111+
// | blocks ^
112+
// v |
113+
// yaml::bolt::BinaryBasicBlockProfile ~= FlowBlock
114+
// ||| probes ^ (majority vote)
115+
// v ||| BBPseudoProbeToBlock
116+
// yaml::bolt::PseudoProbeInfo MCDecodedPseudoProbe
117+
// | InlineTreeIndex ^
118+
// v | probe id
119+
// [ profile node id (uint32_t) -> MCDecodedPseudoProbeInlineTree *]
120+
// InlineTreeNodeMapTy
121+
class InlineTreeNodeMapTy {
122+
// Recorded matches: profile inline tree node index -> binary inline tree node.
DenseMap<uint32_t, const MCDecodedPseudoProbeInlineTree *> Map;
123+
124+
// Records that profile node \p ProfileNodeIdx is matched to \p BinaryNode.
// Each profile node index is expected to be mapped at most once; a repeated
// index trips the assertion below.
void mapInlineTreeNode(uint32_t ProfileNodeIdx,
125+
const MCDecodedPseudoProbeInlineTree *BinaryNode) {
126+
auto Res = Map.try_emplace(ProfileNodeIdx, BinaryNode);
127+
assert(Res.second &&
128+
"Duplicate mapping from profile node index to binary inline tree");
129+
(void)Res; // Res is read only by the assert; keep it referenced without it.
130+
}
131+
132+
public:
133+
/// Returns the binary inline tree node matched to the profile inline tree
/// node id \p ProfileInlineTreeNodeId, or nullptr if no match was recorded
/// for that id.
134+
const MCDecodedPseudoProbeInlineTree *
135+
getInlineTreeNode(uint32_t ProfileInlineTreeNodeId) const {
136+
auto It = Map.find(ProfileInlineTreeNodeId);
137+
if (It == Map.end())
138+
return nullptr;
139+
return It->second;
140+
}
141+
142+
// Match up \p YamlInlineTree with binary inline tree rooted at \p Root.
143+
// Return the number of matched nodes.
144+
//
145+
// This function populates the mapping from profile inline tree node id to a
146+
// corresponding binary MCDecodedPseudoProbeInlineTree node.
// Only declared here; the definition is not part of this header hunk.
147+
size_t matchInlineTrees(
148+
const MCPseudoProbeDecoder &Decoder,
149+
const std::vector<yaml::bolt::InlineTreeNode> &YamlInlineTree,
150+
const MCDecodedPseudoProbeInlineTree *Root);
151+
};
152+
153+
// Partial probe matching specification: matched inline tree and corresponding
154+
// BinaryFunctionProfile
155+
using ProbeMatchSpec =
156+
std::pair<InlineTreeNodeMapTy,
157+
std::reference_wrapper<yaml::bolt::BinaryFunctionProfile>>;
158+
99159
private:
100160
/// Adjustments for basic samples profiles (without LBR).
101161
bool NormalizeByInsnCount{false};
@@ -129,6 +189,13 @@ class YAMLProfileReader : public ProfileReaderBase {
129189
/// BinaryFunction pointers indexed by YamlBP functions.
130190
std::vector<BinaryFunction *> ProfileBFs;
131191

192+
// Pseudo probe function GUID to inline tree node
193+
GUIDInlineTreeMap TopLevelGUIDToInlineTree;
194+
195+
// Mapping from a binary function to its partial match specification
196+
// (YAML profile and its inline tree mapping to binary).
197+
DenseMap<BinaryFunction *, std::vector<ProbeMatchSpec>> BFToProbeMatchSpecs;
198+
132199
/// Populate \p Function profile with the one supplied in YAML format.
133200
bool parseFunctionProfile(BinaryFunction &Function,
134201
const yaml::bolt::BinaryFunctionProfile &YamlBF);
@@ -139,7 +206,8 @@ class YAMLProfileReader : public ProfileReaderBase {
139206

140207
/// Infer function profile from stale data (collected on older binaries).
141208
bool inferStaleProfile(BinaryFunction &Function,
142-
const yaml::bolt::BinaryFunctionProfile &YamlBF);
209+
const yaml::bolt::BinaryFunctionProfile &YamlBF,
210+
const ArrayRef<ProbeMatchSpec> ProbeMatchSpecs);
143211

144212
/// Initialize maps for profile matching.
145213
void buildNameMaps(BinaryContext &BC);
@@ -156,6 +224,10 @@ class YAMLProfileReader : public ProfileReaderBase {
156224
/// Matches functions using the call graph.
157225
size_t matchWithCallGraph(BinaryContext &BC);
158226

227+
/// Matches functions using pseudo probes.
228+
/// Populates BF->partial probe match spec map.
229+
size_t matchWithPseudoProbes(BinaryContext &BC);
230+
159231
/// Matches functions with similarly named profiled functions.
160232
size_t matchWithNameSimilarity(BinaryContext &BC);
161233

bolt/lib/Passes/BinaryPasses.cpp

Lines changed: 42 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1549,10 +1549,48 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
15491549
"BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
15501550
" (%zu out of %zu stale) responsible for %.2f%% samples"
15511551
" (%zu out of %zu stale)\n",
1552-
100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
1553-
BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
1554-
100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
1555-
BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
1552+
100.0 * BC.Stats.NumExactMatchedBlocks / BC.Stats.NumStaleBlocks,
1553+
BC.Stats.NumExactMatchedBlocks, BC.Stats.NumStaleBlocks,
1554+
100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount,
1555+
BC.Stats.ExactMatchedSampleCount, BC.Stats.StaleSampleCount);
1556+
BC.outs() << format(
1557+
"BOLT-INFO: inference found an exact pseudo probe match for %.2f%% of "
1558+
"basic blocks (%zu out of %zu stale) responsible for %.2f%% samples"
1559+
" (%zu out of %zu stale)\n",
1560+
100.0 * BC.Stats.NumPseudoProbeExactMatchedBlocks /
1561+
BC.Stats.NumStaleBlocks,
1562+
BC.Stats.NumPseudoProbeExactMatchedBlocks, BC.Stats.NumStaleBlocks,
1563+
100.0 * BC.Stats.PseudoProbeExactMatchedSampleCount /
1564+
BC.Stats.StaleSampleCount,
1565+
BC.Stats.PseudoProbeExactMatchedSampleCount, BC.Stats.StaleSampleCount);
1566+
BC.outs() << format(
1567+
"BOLT-INFO: inference found a loose pseudo probe match for %.2f%% of "
1568+
"basic blocks (%zu out of %zu stale) responsible for %.2f%% samples"
1569+
" (%zu out of %zu stale)\n",
1570+
100.0 * BC.Stats.NumPseudoProbeLooseMatchedBlocks /
1571+
BC.Stats.NumStaleBlocks,
1572+
BC.Stats.NumPseudoProbeLooseMatchedBlocks, BC.Stats.NumStaleBlocks,
1573+
100.0 * BC.Stats.PseudoProbeLooseMatchedSampleCount /
1574+
BC.Stats.StaleSampleCount,
1575+
BC.Stats.PseudoProbeLooseMatchedSampleCount, BC.Stats.StaleSampleCount);
1576+
BC.outs() << format(
1577+
"BOLT-INFO: inference found a call match for %.2f%% of basic "
1578+
"blocks"
1579+
" (%zu out of %zu stale) responsible for %.2f%% samples"
1580+
" (%zu out of %zu stale)\n",
1581+
100.0 * BC.Stats.NumCallMatchedBlocks / BC.Stats.NumStaleBlocks,
1582+
BC.Stats.NumCallMatchedBlocks, BC.Stats.NumStaleBlocks,
1583+
100.0 * BC.Stats.CallMatchedSampleCount / BC.Stats.StaleSampleCount,
1584+
BC.Stats.CallMatchedSampleCount, BC.Stats.StaleSampleCount);
1585+
BC.outs() << format(
1586+
"BOLT-INFO: inference found a loose match for %.2f%% of basic "
1587+
"blocks"
1588+
" (%zu out of %zu stale) responsible for %.2f%% samples"
1589+
" (%zu out of %zu stale)\n",
1590+
100.0 * BC.Stats.NumLooseMatchedBlocks / BC.Stats.NumStaleBlocks,
1591+
BC.Stats.NumLooseMatchedBlocks, BC.Stats.NumStaleBlocks,
1592+
100.0 * BC.Stats.LooseMatchedSampleCount / BC.Stats.StaleSampleCount,
1593+
BC.Stats.LooseMatchedSampleCount, BC.Stats.StaleSampleCount);
15561594
}
15571595

15581596
if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {

0 commit comments

Comments
 (0)