Skip to content

[CodeLayout] Faster basic block reordering, ext-tsp #68617

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 66 additions & 53 deletions llvm/lib/Transforms/Utils/CodeLayout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,20 +101,19 @@ static cl::opt<unsigned> BackwardDistance(
// The maximum size of a chain created by the algorithm. The size is bounded
// so that the algorithm can efficiently process extremely large instances.
static cl::opt<unsigned>
MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(4096),
cl::desc("The maximum size of a chain to create."));
MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512),
cl::desc("The maximum size of a chain to create"));

// The maximum size of a chain for splitting. Larger values of the threshold
// may yield better quality at the cost of worse run-time.
static cl::opt<unsigned> ChainSplitThreshold(
"ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
cl::desc("The maximum size of a chain to apply splitting"));

// The option enables splitting (large) chains along in-coming and out-going
// jumps. This typically results in a better quality.
static cl::opt<bool> EnableChainSplitAlongJumps(
"ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true),
cl::desc("The maximum size of a chain to apply splitting"));
// The maximum ratio between densities of two chains for merging.
static cl::opt<double> MaxMergeDensityRatio(
"ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(100),
cl::desc("The maximum ratio between densities of two chains for merging"));

// Algorithm-specific options for CDS.
static cl::opt<unsigned> CacheEntries("cds-cache-entries", cl::ReallyHidden,
Expand Down Expand Up @@ -226,6 +225,9 @@ struct NodeT {

bool isEntry() const { return Index == 0; }

// Check if Other is a successor of the node.
bool isSuccessor(const NodeT *Other) const;

// The total execution count of outgoing jumps.
uint64_t outCount() const;

Expand Down Expand Up @@ -289,7 +291,7 @@ struct ChainT {

size_t numBlocks() const { return Nodes.size(); }

double density() const { return static_cast<double>(ExecutionCount) / Size; }
double density() const { return ExecutionCount / Size; }

bool isEntry() const { return Nodes[0]->Index == 0; }

Expand Down Expand Up @@ -350,8 +352,9 @@ struct ChainT {
uint64_t Id;
// Cached ext-tsp score for the chain.
double Score{0};
// The total execution count of the chain.
uint64_t ExecutionCount{0};
// The total execution count of the chain. Since the execution count of
// a basic block is uint64_t, a double is used here to avoid overflow.
double ExecutionCount{0};
// The total size of the chain.
uint64_t Size{0};
// Nodes of the chain.
Expand Down Expand Up @@ -446,6 +449,13 @@ struct ChainEdge {
bool CacheValidBackward{false};
};

bool NodeT::isSuccessor(const NodeT *Other) const {
for (JumpT *Jump : OutJumps)
if (Jump->Target == Other)
return true;
return false;
}

uint64_t NodeT::outCount() const {
uint64_t Count = 0;
for (JumpT *Jump : OutJumps)
Expand Down Expand Up @@ -514,8 +524,6 @@ struct MergedNodesT {

const NodeT *getFirstNode() const { return *Begin1; }

bool empty() const { return Begin1 == End1; }

private:
NodeIter Begin1;
NodeIter End1;
Expand Down Expand Up @@ -639,7 +647,8 @@ class ExtTSPImpl {
}
}
for (JumpT &Jump : AllJumps) {
assert(OutDegree[Jump.Source->Index] > 0);
assert(OutDegree[Jump.Source->Index] > 0 &&
"incorrectly computed out-degree of the block");
Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
}

Expand Down Expand Up @@ -741,12 +750,23 @@ class ExtTSPImpl {
// Get candidates for merging with the current chain.
for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {
// Ignore loop edges.
if (ChainPred == ChainSucc)
if (Edge->isSelfEdge())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: This is often called a self-loop or loop. If you think it's fine, I'll push an NFC commit renaming isSelfEdge to isSelfLoop...

continue;

// Stop early if the combined chain violates the maximum allowed size.
// Skip the merge if the combined chain violates the maximum specified
// size.
if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
continue;
// Don't merge the chains if they have vastly different densities.
// Skip the merge if the ratio between the densities exceeds
// MaxMergeDensityRatio. Smaller values of the option result in fewer
// merges, and hence, more chains.
auto [minDensity, maxDensity] =
std::minmax(ChainPred->density(), ChainSucc->density());
assert(minDensity > 0.0 && maxDensity > 0.0 &&
"incorrectly computed chain densities");
const double Ratio = maxDensity / minDensity;
if (Ratio > MaxMergeDensityRatio)
continue;

// Compute the gain of merging the two chains.
MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
Expand Down Expand Up @@ -858,36 +878,42 @@ class ExtTSPImpl {
Gain.updateIfLessThan(
computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));

if (EnableChainSplitAlongJumps) {
// Attach (a part of) ChainPred before the first node of ChainSucc.
for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
const NodeT *SrcBlock = Jump->Source;
if (SrcBlock->CurChain != ChainPred)
continue;
size_t Offset = SrcBlock->CurIndex + 1;
tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
}
// Attach (a part of) ChainPred before the first node of ChainSucc.
for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
const NodeT *SrcBlock = Jump->Source;
if (SrcBlock->CurChain != ChainPred)
continue;
size_t Offset = SrcBlock->CurIndex + 1;
tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
}

// Attach (a part of) ChainPred after the last node of ChainSucc.
for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
const NodeT *DstBlock = Jump->Target;
if (DstBlock->CurChain != ChainPred)
continue;
size_t Offset = DstBlock->CurIndex;
tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
}
// Attach (a part of) ChainPred after the last node of ChainSucc.
for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
const NodeT *DstBlock = Jump->Target;
if (DstBlock->CurChain != ChainPred)
continue;
size_t Offset = DstBlock->CurIndex;
tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
}

// Try to break ChainPred in various ways and concatenate with ChainSucc.
if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
// Try to split the chain in different ways. In practice, applying
// X2_Y_X1 merging is almost never provides benefits; thus, we exclude
// it from consideration to reduce the search space.
// Do not split the chain along a fall-through jump. One of the two
// loops above may still "break" such a jump whenever it results in a
// new fall-through.
const NodeT *BB = ChainPred->Nodes[Offset - 1];
const NodeT *BB2 = ChainPred->Nodes[Offset];
if (BB->isSuccessor(BB2))
continue;

// In practice, applying X2_Y_X1 merging almost never provides benefits;
// thus, we exclude it from consideration to reduce the search space.
tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
MergeTypeT::X2_X1_Y});
}
}

Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
return Gain;
}
Expand Down Expand Up @@ -946,22 +972,11 @@ class ExtTSPImpl {

/// Concatenate all chains into the final order.
std::vector<uint64_t> concatChains() {
// Collect chains and calculate density stats for their sorting.
// Collect non-empty chains.
std::vector<const ChainT *> SortedChains;
DenseMap<const ChainT *, double> ChainDensity;
for (ChainT &Chain : AllChains) {
if (!Chain.Nodes.empty()) {
if (!Chain.Nodes.empty())
SortedChains.push_back(&Chain);
// Using doubles to avoid overflow of ExecutionCounts.
double Size = 0;
double ExecutionCount = 0;
for (NodeT *Node : Chain.Nodes) {
Size += static_cast<double>(Node->Size);
ExecutionCount += static_cast<double>(Node->ExecutionCount);
}
assert(Size > 0 && "a chain of zero size");
ChainDensity[&Chain] = ExecutionCount / Size;
}
}

// Sorting chains by density in the decreasing order.
Expand All @@ -971,11 +986,9 @@ class ExtTSPImpl {
if (L->isEntry() != R->isEntry())
return L->isEntry();

const double DL = ChainDensity[L];
const double DR = ChainDensity[R];
// Compare by density and break ties by chain identifiers.
return std::make_tuple(-DL, L->Id) <
std::make_tuple(-DR, R->Id);
return std::make_tuple(-L->density(), L->Id) <
std::make_tuple(-R->density(), R->Id);
});

// Collect the nodes in the order specified by their chains.
Expand Down
13 changes: 2 additions & 11 deletions llvm/test/CodeGen/X86/code_placement_ext_tsp.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
;; See also llvm/unittests/Transforms/Utils/CodeLayoutTest.cpp
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 < %s | FileCheck %s
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=0 -ext-tsp-enable-chain-split-along-jumps=0 < %s | FileCheck %s -check-prefix=CHECK2

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the -ext-tsp-chain-split-threshold test removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test relies on another option, ext-tsp-enable-chain-split-along-jumps, which is now deleted.

define void @func1a() {
; Test that the algorithm positions the most likely successor first
Expand Down Expand Up @@ -329,8 +328,8 @@ end:
}

define void @func4() !prof !11 {
; Test verifying that, if enabled, chains can be split in order to improve the
; objective (by creating more fallthroughs)
; Test verifying that chains can be split in order to improve the objective
; by creating more fallthroughs
;
; +-------+
; | entry |--------+
Expand All @@ -354,19 +353,11 @@ define void @func4() !prof !11 {
; | b2 | <+ ----+
; +-------+
;
; With chain splitting enabled:
; CHECK-LABEL: func4:
; CHECK: entry
; CHECK: b1
; CHECK: b3
; CHECK: b2
;
; With chain splitting disabled:
; CHECK2-LABEL: func4:
; CHECK2: entry
; CHECK2: b1
; CHECK2: b2
; CHECK2: b3

entry:
call void @b()
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
@yydebug = dso_local global i32 0, align 4

define void @func_large() !prof !0 {
; A largee CFG instance where chain splitting helps to
; A large CFG instance where chain splitting helps to
; compute a better basic block ordering. The test verifies that with chain
; splitting, the resulting layout is improved (e.g., the score is increased).
;
Expand Down