Skip to content

Commit b9d40a7

Browse files
authored
[llvm-profgen] Improve sample profile density (#92144)
The profile density feature (the amount of samples in the profile relative to the program size) is used to identify insufficient-sample issues and to provide hints for the user to increase the sample count. A low-density profile can be inaccurate due to statistical noise, which can hurt FDO performance. This change introduces two improvements to the current density work. 1. The density calculation/definition is changed. Previously, the density of a profile was calculated as the minimum density for all warm functions (a function was considered warm if its total samples were within the top N percent of the profile). However, there is a problem that a high total sample profile can have a very low density, which makes the density value unstable. - Instead, we want to find a density number such that if a function's density is below this value, it is considered a low-density function. We consider the whole profile bad if a group of low-density functions has a sum of samples that exceeds the N percent cut-off of the total samples. - In the implementation, we sort the function profiles by density, iterate over them in descending order and keep accumulating the body samples until the sum exceeds the (100% - N) percentage of the total_samples; the profile-density is the last (minimum) function-density of the processed functions. We introduce a flag (`--profile-density-cutoff-hot`) for this percentage threshold. 2. The density is now calculated based on the final (compiler-used) profiles instead of merged context-less profiles.
1 parent 66b5f16 commit b9d40a7

File tree

3 files changed

+120
-52
lines changed

3 files changed

+120
-52
lines changed

llvm/test/tools/llvm-profgen/profile-density.test

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
1-
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -hot-function-density-threshold=10 --trim-cold-profile=0 &> %t2
1+
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -profile-density-threshold=10 --trim-cold-profile=0 &> %t2
22
; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-DENSITY
3-
4-
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -hot-function-density-threshold=1 &> %t4
3+
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -profile-density-threshold=1 -profile-density-threshold=10000 &> %t4
54
; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS
5+
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t5 --show-density -profile-density-threshold=1 -profile-density-cutoff-hot=800000 &> %t6
6+
; RUN: FileCheck %s --input-file %t6 --check-prefix=CHECK-DENSITY-CS-80
7+
8+
;CHECK-DENSITY: Sample PGO is estimated to optimize better with 2.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
9+
;CHECK-DENSITY: Functions with density >= 3.5 account for 99.00% total sample counts.
610

7-
;CHECK-DENSITY: Sample PGO is estimated to optimize better with 3.1x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
8-
;CHECK-DENSITY: Minimum profile density for hot functions with top 99.00% total samples: 3.2
11+
;CHECK-DENSITY-CS: Sample PGO is estimated to optimize better with 12.5x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
12+
;CHECK-DENSITY-CS: Functions with density >= 800.1 account for 99.00% total sample counts.
913

10-
;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 128.3
14+
;CHECK-DENSITY-CS-80: Functions with density >= 1886.2 account for 80.00% total sample counts.
1115

1216
; original code:
1317
; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out

llvm/tools/llvm-profgen/ProfileGenerator.cpp

Lines changed: 104 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,18 @@ static cl::opt<int, true> CSProfMaxContextDepth(
7575
"depth limit."),
7676
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
7777

78-
static cl::opt<double> HotFunctionDensityThreshold(
79-
"hot-function-density-threshold", llvm::cl::init(1000),
80-
llvm::cl::desc(
81-
"specify density threshold for hot functions (default: 1000)"),
78+
static cl::opt<double> ProfileDensityThreshold(
79+
"profile-density-threshold", llvm::cl::init(50),
80+
llvm::cl::desc("If the profile density is below the given threshold, it "
81+
"will be suggested to increase the sampling rate."),
8282
llvm::cl::Optional);
8383
static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
8484
llvm::cl::desc("show profile density details"),
8585
llvm::cl::Optional);
86+
static cl::opt<int> ProfileDensityCutOffHot(
87+
"profile-density-cutoff-hot", llvm::cl::init(990000),
88+
llvm::cl::desc("Total samples cutoff for functions used to calculate "
89+
"profile density."));
8690

8791
static cl::opt<bool> UpdateTotalSamples(
8892
"update-total-samples", llvm::cl::init(false),
@@ -179,21 +183,22 @@ void ProfileGeneratorBase::write() {
179183

180184
void ProfileGeneratorBase::showDensitySuggestion(double Density) {
181185
if (Density == 0.0)
182-
WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
186+
WithColor::warning() << "The output profile is empty or the "
187+
"--profile-density-cutoff-hot option is "
183188
"set too low. Please check your command.\n";
184-
else if (Density < HotFunctionDensityThreshold)
189+
else if (Density < ProfileDensityThreshold)
185190
WithColor::warning()
186191
<< "Sample PGO is estimated to optimize better with "
187-
<< format("%.1f", HotFunctionDensityThreshold / Density)
192+
<< format("%.1f", ProfileDensityThreshold / Density)
188193
<< "x more samples. Please consider increasing sampling rate or "
189194
"profiling for longer duration to get more samples.\n";
190195

191196
if (ShowDensity)
192-
outs() << "Minimum profile density for hot functions with top "
197+
outs() << "Functions with density >= " << format("%.1f", Density)
198+
<< " account for "
193199
<< format("%.2f",
194-
static_cast<double>(ProfileSummaryCutoffHot.getValue()) /
195-
10000)
196-
<< "% total samples: " << format("%.1f", Density) << "\n";
200+
static_cast<double>(ProfileDensityCutOffHot) / 10000)
201+
<< "% total sample counts.\n";
197202
}
198203

199204
bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
@@ -238,32 +243,6 @@ void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
238243
}
239244
}
240245

241-
double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles,
242-
uint64_t HotCntThreshold) {
243-
double Density = DBL_MAX;
244-
std::vector<const FunctionSamples *> HotFuncs;
245-
for (auto &I : Profiles) {
246-
auto &FuncSamples = I.second;
247-
if (FuncSamples.getTotalSamples() < HotCntThreshold)
248-
continue;
249-
HotFuncs.emplace_back(&FuncSamples);
250-
}
251-
252-
for (auto *FuncSamples : HotFuncs) {
253-
auto *Func = Binary->getBinaryFunction(FuncSamples->getFunction());
254-
if (!Func)
255-
continue;
256-
uint64_t FuncSize = Func->getFuncSize();
257-
if (FuncSize == 0)
258-
continue;
259-
Density =
260-
std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) /
261-
FuncSize);
262-
}
263-
264-
return Density == DBL_MAX ? 0.0 : Density;
265-
}
266-
267246
void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
268247
const RangeSample &Ranges) {
269248

@@ -768,9 +747,95 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
768747
}
769748
}
770749

750+
void ProfileGeneratorBase::calculateBodySamplesAndSize(
751+
const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
752+
uint64_t &FuncBodySize) {
753+
// Note that ideally the size should be the number of function instruction.
754+
// However, for probe-based profile, we don't have the accurate instruction
755+
// count for each probe, instead, the probe sample is the samples count for
756+
// the block, which is equivelant to
757+
// total_instruction_samples/num_of_instruction in one block. Hence, we use
758+
// the number of probe as a proxy for the function's size.
759+
FuncBodySize += FSamples.getBodySamples().size();
760+
761+
// The accumulated body samples re-calculated here could be different from the
762+
// TotalSamples(getTotalSamples) field of FunctionSamples for line-number
763+
// based profile. The reason is that TotalSamples is the sum of all the
764+
// samples of the machine instruction in one source-code line, however, the
765+
// entry of Bodysamples is the only max number of them, so the TotalSamples is
766+
// usually much bigger than the accumulated body samples as one souce-code
767+
// line can emit many machine instructions. We observed a regression when we
768+
// switched to use the accumulated body samples(by using
769+
// -update-total-samples). Hence, it's safer to re-calculate here to avoid
770+
// such discrepancy. There is no problem for probe-based profile, as the
771+
// TotalSamples is exactly the same as the accumulated body samples.
772+
for (const auto &I : FSamples.getBodySamples())
773+
TotalBodySamples += I.second.getSamples();
774+
775+
for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
776+
for (const auto &Callee : CallsiteSamples.second) {
777+
// For binary-level density, the inlinees' samples and size should be
778+
// included in the calculation.
779+
calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
780+
FuncBodySize);
781+
}
782+
}
783+
784+
// Calculate Profile-density:
785+
// Calculate the density for each function and sort them in descending order,
786+
// keep accumulating their total samples unitl it exceeds the
787+
// percentage_threshold(cut-off) of total profile samples, the profile-density
788+
// is the last(minimum) function-density of the processed functions, which means
789+
// all the functions hot to perf are on good density if the profile-density is
790+
// good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
791+
// depending on how much regression the system want to tolerate.
792+
double
793+
ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
794+
double ProfileDensity = 0.0;
795+
796+
uint64_t TotalProfileSamples = 0;
797+
// A list of the function profile density and its total samples.
798+
std::vector<std::pair<double, uint64_t>> FuncDensityList;
799+
for (const auto &I : Profiles) {
800+
uint64_t TotalBodySamples = 0;
801+
uint64_t FuncBodySize = 0;
802+
calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
803+
804+
if (FuncBodySize == 0)
805+
continue;
806+
807+
double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
808+
TotalProfileSamples += TotalBodySamples;
809+
FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
810+
}
811+
812+
// Sorted by the density in descending order.
813+
llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
814+
const std::pair<double, uint64_t> &B) {
815+
if (A.first != B.first)
816+
return A.first > B.first;
817+
return A.second < B.second;
818+
});
819+
820+
uint64_t AccumulatedSamples = 0;
821+
uint32_t I = 0;
822+
assert(ProfileDensityCutOffHot <= 1000000 &&
823+
"The cutoff value is greater than 1000000(100%)");
824+
while (AccumulatedSamples < TotalProfileSamples *
825+
static_cast<float>(ProfileDensityCutOffHot) /
826+
1000000 &&
827+
I < FuncDensityList.size()) {
828+
AccumulatedSamples += FuncDensityList[I].second;
829+
ProfileDensity = FuncDensityList[I].first;
830+
I++;
831+
}
832+
833+
return ProfileDensity;
834+
}
835+
771836
void ProfileGeneratorBase::calculateAndShowDensity(
772837
const SampleProfileMap &Profiles) {
773-
double Density = calculateDensity(Profiles, HotCountThreshold);
838+
double Density = calculateDensity(Profiles);
774839
showDensitySuggestion(Density);
775840
}
776841

@@ -1057,17 +1122,13 @@ void CSProfileGenerator::postProcessProfiles() {
10571122
CSProfMaxColdContextDepth, EnableCSPreInliner);
10581123
}
10591124

1060-
// Merge function samples of CS profile to calculate profile density.
1061-
sampleprof::SampleProfileMap ContextLessProfiles;
1062-
ProfileConverter::flattenProfile(ProfileMap, ContextLessProfiles, true);
1063-
1064-
calculateAndShowDensity(ContextLessProfiles);
10651125
if (GenCSNestedProfile) {
10661126
ProfileConverter CSConverter(ProfileMap);
10671127
CSConverter.convertCSProfiles();
10681128
FunctionSamples::ProfileIsCS = false;
10691129
}
10701130
filterAmbiguousProfile(ProfileMap);
1131+
ProfileGeneratorBase::calculateAndShowDensity(ProfileMap);
10711132
}
10721133

10731134
void ProfileGeneratorBase::computeSummaryAndThreshold(

llvm/tools/llvm-profgen/ProfileGenerator.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,13 @@ class ProfileGeneratorBase {
116116

117117
void computeSummaryAndThreshold(SampleProfileMap &ProfileMap);
118118

119-
void calculateAndShowDensity(const SampleProfileMap &Profiles);
119+
void calculateBodySamplesAndSize(const FunctionSamples &FSamples,
120+
uint64_t &TotalBodySamples,
121+
uint64_t &FuncBodySize);
122+
123+
double calculateDensity(const SampleProfileMap &Profiles);
120124

121-
double calculateDensity(const SampleProfileMap &Profiles,
122-
uint64_t HotCntThreshold);
125+
void calculateAndShowDensity(const SampleProfileMap &Profiles);
123126

124127
void showDensitySuggestion(double Density);
125128

0 commit comments

Comments
 (0)