
Commit 49ee606

[BOLT][AArch64] Add support for compact code model (#112110)
Add `--compact-code-model` option that performs an alternative branch relaxation under the assumption that the resulting binary has less than 128MB of code. The relaxation is done in `relaxLocalBranches()`, which operates at the function level and runs on multiple functions in parallel. Running with the new option on an AArch64 Clang binary produces slightly smaller code, and the relaxation finishes in about 1/10th of the time. Note that the new `.text` has to be smaller than 128MB, *and* `.plt` has to be closer than 128MB to `.text`.
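For reference, the new flag is passed like any other BOLT option on the llvm-bolt command line; a minimal illustrative invocation (the input and output names here are placeholders, not part of the commit):

llvm-bolt clang -o clang.bolt --compact-code-model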
1 parent 3b1b127 commit 49ee606

6 files changed: +399 −5 lines changed

bolt/include/bolt/Core/BinaryBasicBlock.h

Lines changed: 3 additions & 0 deletions
@@ -819,6 +819,9 @@ class BinaryBasicBlock {
     return OutputAddressRange;
   }
 
+  uint64_t getOutputStartAddress() const { return OutputAddressRange.first; }
+  uint64_t getOutputEndAddress() const { return OutputAddressRange.second; }
+
   bool hasLocSyms() const { return LocSyms != nullptr; }
 
   /// Return mapping of input offsets to symbols in the output.

bolt/include/bolt/Core/FunctionLayout.h

Lines changed: 2 additions & 1 deletion
@@ -123,7 +123,8 @@ class FunctionFragment {
   const_iterator begin() const;
   iterator end();
   const_iterator end() const;
-  const BinaryBasicBlock *front() const;
+  BinaryBasicBlock *front() const;
+  BinaryBasicBlock *back() const;
 
   friend class FunctionLayout;
 };

bolt/include/bolt/Passes/LongJmp.h

Lines changed: 13 additions & 0 deletions
@@ -63,6 +63,19 @@ class LongJmpPass : public BinaryFunctionPass {
   uint32_t NumColdStubs{0};
   uint32_t NumSharedStubs{0};
 
+  /// The shortest distance for any branch instruction on AArch64.
+  static constexpr size_t ShortestJumpBits = 16;
+  static constexpr size_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1);
+
+  /// The longest single-instruction branch.
+  static constexpr size_t LongestJumpBits = 28;
+  static constexpr size_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);
+
+  /// Relax all internal function branches including those between fragments.
+  /// Assume that fragments are placed in different sections but are within
+  /// 128MB of each other.
+  void relaxLocalBranches(BinaryFunction &BF);
+
   /// -- Layout estimation methods --
   /// Try to do layout before running the emitter, by looking at BinaryFunctions
   /// and MCInsts -- this is an estimation. To be correct for longjmp inserter
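As a side note (not part of the commit), the two constants encode the reach of the shortest- and longest-range AArch64 branches: 1ULL << (16 - 1) is ±32 KiB (e.g. TBZ/TBNZ), and 1ULL << (28 - 1) is ±128 MiB (unconditional B/BL), which is where the 128MB limit in the option name comes from. A minimal standalone sketch that spot-checks the arithmetic:

// Standalone sanity check (illustrative only, not in the commit): the spans
// encoded by ShortestJumpBits/LongestJumpBits above.
#include <cstddef>

static constexpr size_t ShortestJumpBits = 16; // shortest AArch64 branch reach
static constexpr size_t LongestJumpBits = 28;  // longest single-instruction branch

static_assert((1ULL << (ShortestJumpBits - 1)) == 32 * 1024,
              "shortest branch spans +/-32 KiB");
static_assert((1ULL << (LongestJumpBits - 1)) == 128 * 1024 * 1024,
              "longest branch spans +/-128 MiB");

int main() { return 0; }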

bolt/lib/Core/FunctionLayout.cpp

Lines changed: 3 additions & 1 deletion
@@ -33,7 +33,9 @@ FunctionFragment::const_iterator FunctionFragment::end() const {
   return const_iterator(Layout->block_begin() + StartIndex + Size);
 }
 
-const BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+
+BinaryBasicBlock *FunctionFragment::back() const { return *std::prev(end()); }
 
 FunctionLayout::FunctionLayout() { addFragment(); }
 

bolt/lib/Passes/LongJmp.cpp

Lines changed: 286 additions & 3 deletions
@@ -11,18 +11,26 @@
 //===----------------------------------------------------------------------===//
 
 #include "bolt/Passes/LongJmp.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "llvm/Support/MathExtras.h"
 
 #define DEBUG_TYPE "longjmp"
 
 using namespace llvm;
 
 namespace opts {
+extern cl::OptionCategory BoltCategory;
 extern cl::OptionCategory BoltOptCategory;
 extern llvm::cl::opt<unsigned> AlignText;
 extern cl::opt<unsigned> AlignFunctions;
 extern cl::opt<bool> UseOldText;
 extern cl::opt<bool> HotFunctionsAtEnd;
 
+static cl::opt<bool>
+    CompactCodeModel("compact-code-model",
+                     cl::desc("generate code for binaries <128MB on AArch64"),
+                     cl::init(false), cl::cat(BoltCategory));
+
 static cl::opt<bool> GroupStubs("group-stubs",
                                 cl::desc("share stubs across functions"),
                                 cl::init(true), cl::cat(BoltOptCategory));
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
     if (Next != E && (*Next)->isCold())
       return *I;
   }
-  llvm_unreachable("No hot-colt split point found");
+  llvm_unreachable("No hot-cold split point found");
 }
 
-static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
+static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
   return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
          !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
 }
@@ -570,7 +578,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
       if (BC.MIB->isPseudo(Inst))
         continue;
 
-      if (!shouldInsertStub(BC, Inst)) {
+      if (!mayNeedStub(BC, Inst)) {
         DotAddress += InsnSize;
         continue;
       }
@@ -634,7 +642,282 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
   return Error::success();
 }
 
+void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+  auto &MIB = BC.MIB;
+
+  // Quick path.
+  if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
+    return;
+
+  auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
+    const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
+    return isIntN(Bits, Offset);
+  };
+
+  auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
+                            const BinaryBasicBlock &BB) {
+    const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
+    return isBranchOffsetInRange(Inst, Offset);
+  };
+
+  // Keep track of *all* function trampolines that are going to be added to the
+  // function layout at the end of relaxation.
+  std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
+      FunctionTrampolines;
+
+  // Function fragments are relaxed independently.
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    // Fill out code size estimation for the fragment. Use output BB address
+    // ranges to store offsets from the start of the function fragment.
+    uint64_t CodeSize = 0;
+    for (BinaryBasicBlock *BB : FF) {
+      BB->setOutputStartAddress(CodeSize);
+      CodeSize += BB->estimateSize();
+      BB->setOutputEndAddress(CodeSize);
+    }
+
+    // Dynamically-updated size of the fragment.
+    uint64_t FragmentSize = CodeSize;
+
+    // Size of the trampoline in bytes.
+    constexpr uint64_t TrampolineSize = 4;
+
+    // Trampolines created for the fragment. DestinationBB -> TrampolineBB.
+    // NB: here we store only the first trampoline created for DestinationBB.
+    DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
+
+    // Create a trampoline code after \p BB or at the end of the fragment if BB
+    // is nullptr. If \p UpdateOffsets is true, update FragmentSize and offsets
+    // for basic blocks affected by the insertion of the trampoline.
+    auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
+                                  BinaryBasicBlock *TargetBB, uint64_t Count,
+                                  bool UpdateOffsets = true) {
+      FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
+                                       BF.createBasicBlock());
+      BinaryBasicBlock *TrampolineBB = FunctionTrampolines.back().second.get();
+
+      MCInst Inst;
+      {
+        auto L = BC.scopeLock();
+        MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
+      }
+      TrampolineBB->addInstruction(Inst);
+      TrampolineBB->addSuccessor(TargetBB, Count);
+      TrampolineBB->setExecutionCount(Count);
+      const uint64_t TrampolineAddress =
+          BB ? BB->getOutputEndAddress() : FragmentSize;
+      TrampolineBB->setOutputStartAddress(TrampolineAddress);
+      TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
+      TrampolineBB->setFragmentNum(FF.getFragmentNum());
+
+      if (!FragmentTrampolines.lookup(TargetBB))
+        FragmentTrampolines[TargetBB] = TrampolineBB;
+
+      if (!UpdateOffsets)
+        return TrampolineBB;
+
+      FragmentSize += TrampolineSize;
+
+      // If the trampoline was added at the end of the fragment, offsets of
+      // other fragments should stay intact.
+      if (!BB)
+        return TrampolineBB;
+
+      // Update offsets for blocks after BB.
+      for (BinaryBasicBlock *IBB : FF) {
+        if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+          IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                     TrampolineSize);
+          IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
+        }
+      }
+
+      // Update offsets for trampolines in this fragment that are placed after
+      // the new trampoline. Note that trampoline blocks are not part of the
+      // function/fragment layout until we add them right before the return
+      // from relaxLocalBranches().
+      for (auto &Pair : FunctionTrampolines) {
+        BinaryBasicBlock *IBB = Pair.second.get();
+        if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
+          continue;
+        if (IBB == TrampolineBB)
+          continue;
+        if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+          IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                     TrampolineSize);
+          IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
+        }
+      }
+
+      return TrampolineBB;
+    };
+
+    // Pre-populate trampolines by splitting unconditional branches from the
+    // containing basic block.
+    for (BinaryBasicBlock *BB : FF) {
+      MCInst *Inst = BB->getLastNonPseudoInstr();
+      if (!Inst || !MIB->isUnconditionalBranch(*Inst))
+        continue;
+
+      const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
+      BB->eraseInstruction(BB->findInstruction(Inst));
+      BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);
+
+      BinaryBasicBlock::BinaryBranchInfo BI;
+      BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);
+
+      BinaryBasicBlock *TrampolineBB =
+          addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
+      BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
+    }
+
+    /// Relax the branch \p Inst in basic block \p BB that targets \p TargetBB.
+    /// \p InstAddress contains offset of the branch from the start of the
+    /// containing function fragment.
+    auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
+                           uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
+      BinaryFunction *BF = BB->getParent();
+
+      // Use branch taken count for optimal relaxation.
+      const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
+      assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+             "Expected valid branch execution count");
+
+      // Try to reuse an existing trampoline without introducing any new code.
+      BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
+      if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
+                                        Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+        return;
+      }
+
+      // For cold branches, check if we can introduce a trampoline at the end
+      // of the fragment that is within the branch reach. Note that such
+      // trampoline may change address later and become unreachable in which
+      // case we will need further relaxation.
+      const int64_t OffsetToEnd = FragmentSize - InstAddress;
+      if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
+        TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+
+        return;
+      }
+
+      // Insert a new block after the current one and use it as a trampoline.
+      TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);
+
+      // If the other successor is a fall-through, invert the condition code.
+      const BinaryBasicBlock *const NextBB =
+          BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
+      if (BB->getConditionalSuccessor(false) == NextBB) {
+        BB->swapConditionalSuccessors();
+        auto L = BC.scopeLock();
+        MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
+      } else {
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+      }
+      BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+    };
+
+    bool MayNeedRelaxation;
+    uint64_t NumIterations = 0;
+    do {
+      MayNeedRelaxation = false;
+      ++NumIterations;
+      for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
+        BinaryBasicBlock *BB = *BBI;
+        uint64_t NextInstOffset = BB->getOutputStartAddress();
+        for (MCInst &Inst : *BB) {
+          const size_t InstAddress = NextInstOffset;
+          if (!MIB->isPseudo(Inst))
+            NextInstOffset += 4;
+
+          if (!mayNeedStub(BF.getBinaryContext(), Inst))
+            continue;
+
+          const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);
+
+          // Span of +/-128MB.
+          if (BitsAvailable == LongestJumpBits)
+            continue;
+
+          const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
+          BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
+          assert(TargetBB &&
+                 "Basic block target expected for conditional branch.");
+
+          // Check if the relaxation is needed.
+          if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
+              isBlockInRange(Inst, InstAddress, *TargetBB))
+            continue;
+
+          relaxBranch(BB, Inst, InstAddress, TargetBB);
+
+          MayNeedRelaxation = true;
+        }
+      }
+
+      // We may have added new instructions, but the whole fragment is less than
+      // the minimum branch span.
+      if (FragmentSize < ShortestJumpSpan)
+        MayNeedRelaxation = false;
+
+    } while (MayNeedRelaxation);
+
+    LLVM_DEBUG({
+      if (NumIterations > 2) {
+        dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
+               << " of " << BF << " in " << NumIterations << " iterations\n";
+      }
+    });
+  }
+
+  // Add trampoline blocks from all fragments to the layout.
+  DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
+      Insertions;
+  for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
+       FunctionTrampolines) {
+    if (!Pair.second)
+      continue;
+    Insertions[Pair.first].emplace_back(std::move(Pair.second));
+  }
+
+  for (auto &Pair : Insertions) {
+    BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
+                         /*UpdateLayout*/ true, /*UpdateCFI*/ true,
+                         /*RecomputeLPs*/ false);
+  }
+}
+
 Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
+
+  if (opts::CompactCodeModel) {
+    BC.outs()
+        << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";
+
+    ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+      relaxLocalBranches(BF);
+    };
+
+    ParallelUtilities::PredicateTy SkipPredicate =
+        [&](const BinaryFunction &BF) {
+          return !BC.shouldEmit(BF) || !BF.isSimple();
+        };
+
+    ParallelUtilities::runOnEachFunction(
+        BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
+        SkipPredicate, "RelaxLocalBranches");
+
+    return Error::success();
+  }
+
   BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
   std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
   bool Modified;
