Skip to content

[llvm-profgen] Improve sample profile density #92144

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions llvm/test/tools/llvm-profgen/profile-density.test
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -hot-function-density-threshold=10 --trim-cold-profile=0 &> %t2
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -profile-density-threshold=10 --trim-cold-profile=0 &> %t2
; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-DENSITY
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was there no test for suggestion? should we add one?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the suggestion for CS profile (There was one for the non-CS profile)


; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -hot-function-density-threshold=1 &> %t4
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -profile-density-threshold=1 -profile-density-threshold=10000 &> %t4
; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t5 --show-density -profile-density-threshold=1 -profile-density-cutoff-hot=800000 &> %t6
; RUN: FileCheck %s --input-file %t6 --check-prefix=CHECK-DENSITY-CS-80

;CHECK-DENSITY: Sample PGO is estimated to optimize better with 2.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
;CHECK-DENSITY: Functions with density >= 3.5 account for 99.00% total sample counts.

;CHECK-DENSITY: Sample PGO is estimated to optimize better with 3.1x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
;CHECK-DENSITY: Minimum profile density for hot functions with top 99.00% total samples: 3.2
;CHECK-DENSITY-CS: Sample PGO is estimated to optimize better with 12.5x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
;CHECK-DENSITY-CS: Functions with density >= 800.1 account for 99.00% total sample counts.

;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 128.3
;CHECK-DENSITY-CS-80: Functions with density >= 1886.2 account for 80.00% total sample counts.

; original code:
; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out
Expand Down
147 changes: 104 additions & 43 deletions llvm/tools/llvm-profgen/ProfileGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,18 @@ static cl::opt<int, true> CSProfMaxContextDepth(
"depth limit."),
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));

static cl::opt<double> HotFunctionDensityThreshold(
"hot-function-density-threshold", llvm::cl::init(1000),
llvm::cl::desc(
"specify density threshold for hot functions (default: 1000)"),
static cl::opt<double> ProfileDensityThreshold(
"profile-density-threshold", llvm::cl::init(50),
llvm::cl::desc("If the profile density is below the given threshold, it "
"will be suggested to increase the sampling rate."),
llvm::cl::Optional);
static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
llvm::cl::desc("show profile density details"),
llvm::cl::Optional);
static cl::opt<int> ProfileDensityCutOffHot(
"profile-density-cutoff-hot", llvm::cl::init(990000),
llvm::cl::desc("Total samples cutoff for functions used to calculate "
"profile density."));

static cl::opt<bool> UpdateTotalSamples(
"update-total-samples", llvm::cl::init(false),
Expand Down Expand Up @@ -179,21 +183,22 @@ void ProfileGeneratorBase::write() {

void ProfileGeneratorBase::showDensitySuggestion(double Density) {
if (Density == 0.0)
WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
WithColor::warning() << "The output profile is empty or the "
"--profile-density-cutoff-hot option is "
"set too low. Please check your command.\n";
else if (Density < HotFunctionDensityThreshold)
else if (Density < ProfileDensityThreshold)
WithColor::warning()
<< "Sample PGO is estimated to optimize better with "
<< format("%.1f", HotFunctionDensityThreshold / Density)
<< format("%.1f", ProfileDensityThreshold / Density)
<< "x more samples. Please consider increasing sampling rate or "
"profiling for longer duration to get more samples.\n";

if (ShowDensity)
outs() << "Minimum profile density for hot functions with top "
outs() << "Functions with density >= " << format("%.1f", Density)
<< " account for "
<< format("%.2f",
static_cast<double>(ProfileSummaryCutoffHot.getValue()) /
10000)
<< "% total samples: " << format("%.1f", Density) << "\n";
static_cast<double>(ProfileDensityCutOffHot) / 10000)
<< "% total sample counts.\n";
}

bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
Expand Down Expand Up @@ -238,32 +243,6 @@ void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
}
}

double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles,
uint64_t HotCntThreshold) {
double Density = DBL_MAX;
std::vector<const FunctionSamples *> HotFuncs;
for (auto &I : Profiles) {
auto &FuncSamples = I.second;
if (FuncSamples.getTotalSamples() < HotCntThreshold)
continue;
HotFuncs.emplace_back(&FuncSamples);
}

for (auto *FuncSamples : HotFuncs) {
auto *Func = Binary->getBinaryFunction(FuncSamples->getFunction());
if (!Func)
continue;
uint64_t FuncSize = Func->getFuncSize();
if (FuncSize == 0)
continue;
Density =
std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) /
FuncSize);
}

return Density == DBL_MAX ? 0.0 : Density;
}

void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
const RangeSample &Ranges) {

Expand Down Expand Up @@ -768,9 +747,95 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
}
}

void ProfileGeneratorBase::calculateBodySamplesAndSize(
const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How different is this TotalBodySamples comparing to FSample.getTotalSamples() which also includes all inlinees?

Copy link
Contributor Author

@wlei-llvm wlei-llvm May 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be no difference for probe-based profile. But very different to line-number based (AutoFDO) profile, in which the entry is the max of the counts in the same line, however, the total_samples is a sum of all the counts in the same line, so the getTotalSamples is much bigger than sum of each entry. That said, if we can adjust to use the binary instruction size, it also should be good to use getTotalSamples then the density for autofdo is getTotalSamples / binary_size.

If so, we need to diverge the function size calculation:
For pseudo-probe, we use the way in this version.
for line-number, we use the binary size(binary dwarf start_address - end_address).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May still not accurate for AutoFDO to use the start_address - end_address, start_address - end_address is also not equal to the num of instruction since instruction size is not always 1.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

total_samples is a sum of all the counts in the same line

For AutoFDO, based on ProfileGeneratorBase::updateTotalSamples, isn't total sample the same as sum of all body samples + callsite (inlinee) samples?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

total_samples is a sum of all the counts in the same line

For AutoFDO, based on ProfileGeneratorBase::updateTotalSamples, isn't total sample the same as sum of all body samples + callsite (inlinee) samples?

Yes, using updateTotalSamples is equal to the sum of all body samples + callsite (inlinee) samples.. But the --update-total-samples is default off.(I remember because there is regression with this on)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though --update-total-samples is off, how big of a difference do we expect between total samples and the way you compute it? I actually don't remember what is the source of discrepency, and still don't understand why we have regression. If they are close enough, it shouldn't matter for computing density?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though --update-total-samples is off, how big of a difference do we expect between total samples and the way you compute it? I actually don't remember what is the source of discrepency, and still don't understand why we have regression. If they are close enough, it shouldn't matter for computing density?

It could be 2X~10X bigger, the multiplier is equal to how many physical instruction one source instruction(debug-line) emits, usually one source code line can generate many physical instructions, it could be big.

There is an example in our test case: https://github.com/llvm/llvm-project/blob/main/llvm/test/tools/llvm-profgen/inline-noprobe.test#L17-L45

See the CHECK: main:2609:0 vs CHECK-UPDATE-TOTAL-SAMPLE: main:292:0, it could be ~10X.

it's indeed hard to understand why it can cause regression, it's just the observation from data.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, can you add a comment explaining why we need to accumulate the samples manually instead of using getTotalSamples? including the fact that --update-total-samples is off and the reasoning, difference between AutoFDO and CSSPGO. A comment would be very useful because after a while someone may look at this code be very tempted to refactor / simplify it with getTotalSamples.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comments added

uint64_t &FuncBodySize) {
// Note that ideally the size should be the number of function instruction.
// However, for probe-based profile, we don't have the accurate instruction
// count for each probe, instead, the probe sample is the samples count for
// the block, which is equivelant to
// total_instruction_samples/num_of_instruction in one block. Hence, we use
// the number of probe as a proxy for the function's size.
FuncBodySize += FSamples.getBodySamples().size();

// The accumulated body samples re-calculated here could be different from the
// TotalSamples(getTotalSamples) field of FunctionSamples for line-number
// based profile. The reason is that TotalSamples is the sum of all the
// samples of the machine instruction in one source-code line, however, the
// entry of Bodysamples is the only max number of them, so the TotalSamples is
// usually much bigger than the accumulated body samples as one souce-code
// line can emit many machine instructions. We observed a regression when we
// switched to use the accumulated body samples(by using
// -update-total-samples). Hence, it's safer to re-calculate here to avoid
// such discrepancy. There is no problem for probe-based profile, as the
// TotalSamples is exactly the same as the accumulated body samples.
for (const auto &I : FSamples.getBodySamples())
TotalBodySamples += I.second.getSamples();

for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
for (const auto &Callee : CallsiteSamples.second) {
// For binary-level density, the inlinees' samples and size should be
// included in the calculation.
calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
FuncBodySize);
}
}

// Calculate Profile-density:
// Calculate the density for each function and sort them in descending order,
// keep accumulating their total samples unitl it exceeds the
// percentage_threshold(cut-off) of total profile samples, the profile-density
// is the last(minimum) function-density of the processed functions, which means
// all the functions hot to perf are on good density if the profile-density is
// good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
// depending on how much regression the system want to tolerate.
double
ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
double ProfileDensity = 0.0;

uint64_t TotalProfileSamples = 0;
// A list of the function profile density and its total samples.
std::vector<std::pair<double, uint64_t>> FuncDensityList;
for (const auto &I : Profiles) {
uint64_t TotalBodySamples = 0;
uint64_t FuncBodySize = 0;
calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);

if (FuncBodySize == 0)
continue;

double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
TotalProfileSamples += TotalBodySamples;
FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
}

// Sorted by the density in descending order.
llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
const std::pair<double, uint64_t> &B) {
if (A.first != B.first)
return A.first > B.first;
return A.second < B.second;
});

uint64_t AccumulatedSamples = 0;
uint32_t I = 0;
assert(ProfileDensityCutOffHot <= 1000000 &&
"The cutoff value is greater than 1000000(100%)");
while (AccumulatedSamples < TotalProfileSamples *
static_cast<float>(ProfileDensityCutOffHot) /
1000000 &&
I < FuncDensityList.size()) {
AccumulatedSamples += FuncDensityList[I].second;
ProfileDensity = FuncDensityList[I].first;
I++;
}

return ProfileDensity;
}

void ProfileGeneratorBase::calculateAndShowDensity(
const SampleProfileMap &Profiles) {
double Density = calculateDensity(Profiles, HotCountThreshold);
double Density = calculateDensity(Profiles);
showDensitySuggestion(Density);
}

Expand Down Expand Up @@ -1057,17 +1122,13 @@ void CSProfileGenerator::postProcessProfiles() {
CSProfMaxColdContextDepth, EnableCSPreInliner);
}

// Merge function samples of CS profile to calculate profile density.
sampleprof::SampleProfileMap ContextLessProfiles;
ProfileConverter::flattenProfile(ProfileMap, ContextLessProfiles, true);

calculateAndShowDensity(ContextLessProfiles);
if (GenCSNestedProfile) {
ProfileConverter CSConverter(ProfileMap);
CSConverter.convertCSProfiles();
FunctionSamples::ProfileIsCS = false;
}
filterAmbiguousProfile(ProfileMap);
ProfileGeneratorBase::calculateAndShowDensity(ProfileMap);
}

void ProfileGeneratorBase::computeSummaryAndThreshold(
Expand Down
9 changes: 6 additions & 3 deletions llvm/tools/llvm-profgen/ProfileGenerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,13 @@ class ProfileGeneratorBase {

void computeSummaryAndThreshold(SampleProfileMap &ProfileMap);

void calculateAndShowDensity(const SampleProfileMap &Profiles);
void calculateBodySamplesAndSize(const FunctionSamples &FSamples,
uint64_t &TotalBodySamples,
uint64_t &FuncBodySize);

double calculateDensity(const SampleProfileMap &Profiles);

double calculateDensity(const SampleProfileMap &Profiles,
uint64_t HotCntThreshold);
void calculateAndShowDensity(const SampleProfileMap &Profiles);

void showDensitySuggestion(double Density);

Expand Down
Loading