@@ -11,18 +11,26 @@
 //===----------------------------------------------------------------------===//
 
 #include "bolt/Passes/LongJmp.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "llvm/Support/MathExtras.h"
 
 #define DEBUG_TYPE "longjmp"
 
 using namespace llvm;
 
 namespace opts {
+extern cl::OptionCategory BoltCategory;
 extern cl::OptionCategory BoltOptCategory;
 extern llvm::cl::opt<unsigned> AlignText;
 extern cl::opt<unsigned> AlignFunctions;
 extern cl::opt<bool> UseOldText;
 extern cl::opt<bool> HotFunctionsAtEnd;
 
+static cl::opt<bool>
+    CompactCodeModel("compact-code-model",
+                     cl::desc("generate code for binaries <128MB on AArch64"),
+                     cl::init(false), cl::cat(BoltCategory));
+
 static cl::opt<bool> GroupStubs("group-stubs",
                                 cl::desc("share stubs across functions"),
                                 cl::init(true), cl::cat(BoltOptCategory));
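
The "<128MB" in the option description corresponds to the reach of the widest AArch64 PC-relative branch (`B`/`BL`); when all code fits in 128MB, only the narrower encodings can ever be out of range. The standalone sketch below (not part of the patch) tabulates the encodings involved; the bit widths are assumptions from the AArch64 ISA that line up with the `LongestJumpBits == 28` check later in this diff, since a signed N-bit byte offset reaches +/-2^(N-1) bytes.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Signed byte-offset width of each AArch64 PC-relative branch encoding
  // (immediate bits + 2, since offsets are scaled by the 4-byte insn size).
  struct { const char *Kind; unsigned OffsetBits; } Encodings[] = {
      {"B / BL (unconditional, call)", 28}, // 26-bit imm -> +/-128MB
      {"B.cond / CBZ / CBNZ", 21},          // 19-bit imm -> +/-1MB
      {"TBZ / TBNZ", 16},                   // 14-bit imm -> +/-32KB
  };
  for (const auto &E : Encodings) {
    const uint64_t ReachKB = (UINT64_C(1) << (E.OffsetBits - 1)) / 1024;
    std::printf("%-30s reaches +/-%llu KB\n", E.Kind,
                (unsigned long long)ReachKB);
  }
  return 0;
}
```
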
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
     if (Next != E && (*Next)->isCold())
       return *I;
   }
-  llvm_unreachable("No hot-colt split point found");
+  llvm_unreachable("No hot-cold split point found");
 }
 
-static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
+static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
   return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
          !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
 }
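
The rename captures what the predicate actually computes: a direct branch or call only *may* need a stub, depending on the distance to its target, which is not known at this point; indirect branches and calls are excluded because a register-held target already has the full 64-bit range.
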
@@ -570,7 +578,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
       if (BC.MIB->isPseudo(Inst))
         continue;
 
-      if (!shouldInsertStub(BC, Inst)) {
+      if (!mayNeedStub(BC, Inst)) {
         DotAddress += InsnSize;
         continue;
       }
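
The relaxation code added in the next hunk reduces every reachability test to `llvm::isIntN()` over a signed byte distance (see the `isBranchOffsetInRange` lambda there), which is what the newly added `MathExtras.h` include provides. A minimal sketch, assuming LLVM headers are on the include path; `branchOffsetFits` is a hypothetical helper, not BOLT API:

```cpp
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring the isBranchOffsetInRange lambda below: a
// branch reaches its target iff the signed byte distance fits in the
// encoding's offset width.
static bool branchOffsetFits(unsigned OffsetBits, int64_t ByteOffset) {
  return llvm::isIntN(OffsetBits, ByteOffset);
}

int main() {
  assert(branchOffsetFits(21, 1000000));            // <1MB: B.cond reaches it
  assert(!branchOffsetFits(21, INT64_C(1) << 21));  // 2MB: needs a trampoline
  assert(branchOffsetFits(28, INT64_C(100) << 20)); // 100MB: fine for B / BL
  return 0;
}
```
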
@@ -634,7 +642,282 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
   return Error::success();
 }
 
+void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+  auto &MIB = BC.MIB;
+
+  // Quick path.
+  if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
+    return;
+
+  auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
+    const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
+    return isIntN(Bits, Offset);
+  };
+
+  auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
+                            const BinaryBasicBlock &BB) {
+    const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
+    return isBranchOffsetInRange(Inst, Offset);
+  };
+
+  // Keep track of *all* function trampolines that are going to be added to the
+  // function layout at the end of relaxation.
+  std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
+      FunctionTrampolines;
+
+  // Function fragments are relaxed independently.
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    // Fill out code size estimation for the fragment. Use output BB address
+    // ranges to store offsets from the start of the function fragment.
+    uint64_t CodeSize = 0;
+    for (BinaryBasicBlock *BB : FF) {
+      BB->setOutputStartAddress(CodeSize);
+      CodeSize += BB->estimateSize();
+      BB->setOutputEndAddress(CodeSize);
+    }
+
+    // Dynamically-updated size of the fragment.
+    uint64_t FragmentSize = CodeSize;
+
+    // Size of the trampoline in bytes.
+    constexpr uint64_t TrampolineSize = 4;
+
+    // Trampolines created for the fragment. DestinationBB -> TrampolineBB.
+    // NB: here we store only the first trampoline created for DestinationBB.
+    DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
+
+    // Create a trampoline code after \p BB or at the end of the fragment if BB
+    // is nullptr. If \p UpdateOffsets is true, update FragmentSize and offsets
+    // for basic blocks affected by the insertion of the trampoline.
+    auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
+                                  BinaryBasicBlock *TargetBB, uint64_t Count,
+                                  bool UpdateOffsets = true) {
+      FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
+                                       BF.createBasicBlock());
+      BinaryBasicBlock *TrampolineBB = FunctionTrampolines.back().second.get();
+
+      MCInst Inst;
+      {
+        auto L = BC.scopeLock();
+        MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
+      }
+      TrampolineBB->addInstruction(Inst);
+      TrampolineBB->addSuccessor(TargetBB, Count);
+      TrampolineBB->setExecutionCount(Count);
+      const uint64_t TrampolineAddress =
+          BB ? BB->getOutputEndAddress() : FragmentSize;
+      TrampolineBB->setOutputStartAddress(TrampolineAddress);
+      TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
+      TrampolineBB->setFragmentNum(FF.getFragmentNum());
+
+      if (!FragmentTrampolines.lookup(TargetBB))
+        FragmentTrampolines[TargetBB] = TrampolineBB;
+
+      if (!UpdateOffsets)
+        return TrampolineBB;
+
+      FragmentSize += TrampolineSize;
+
+      // If the trampoline was added at the end of the fragment, offsets of
+      // other fragments should stay intact.
+      if (!BB)
+        return TrampolineBB;
+
+      // Update offsets for blocks after BB.
+      for (BinaryBasicBlock *IBB : FF) {
+        if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+          IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                     TrampolineSize);
+          IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
+        }
+      }
+
+      // Update offsets for trampolines in this fragment that are placed after
+      // the new trampoline. Note that trampoline blocks are not part of the
+      // function/fragment layout until we add them right before the return
+      // from relaxLocalBranches().
+      for (auto &Pair : FunctionTrampolines) {
+        BinaryBasicBlock *IBB = Pair.second.get();
+        if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
+          continue;
+        if (IBB == TrampolineBB)
+          continue;
+        if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+          IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                     TrampolineSize);
+          IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
+        }
+      }
+
+      return TrampolineBB;
+    };
+
+    // Pre-populate trampolines by splitting unconditional branches from the
+    // containing basic block.
+    for (BinaryBasicBlock *BB : FF) {
+      MCInst *Inst = BB->getLastNonPseudoInstr();
+      if (!Inst || !MIB->isUnconditionalBranch(*Inst))
+        continue;
+
+      const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
+      BB->eraseInstruction(BB->findInstruction(Inst));
+      BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);
+
+      BinaryBasicBlock::BinaryBranchInfo BI;
+      BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);
+
+      BinaryBasicBlock *TrampolineBB =
+          addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
+      BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
+    }
+
+    /// Relax the branch \p Inst in basic block \p BB that targets \p TargetBB.
+    /// \p InstAddress contains offset of the branch from the start of the
+    /// containing function fragment.
+    auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
+                           uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
+      BinaryFunction *BF = BB->getParent();
+
+      // Use branch taken count for optimal relaxation.
+      const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
+      assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+             "Expected valid branch execution count");
+
+      // Try to reuse an existing trampoline without introducing any new code.
+      BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
+      if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
+                                        Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+        return;
+      }
+
+      // For cold branches, check if we can introduce a trampoline at the end
+      // of the fragment that is within the branch reach. Note that such
+      // trampoline may change address later and become unreachable in which
+      // case we will need further relaxation.
+      const int64_t OffsetToEnd = FragmentSize - InstAddress;
+      if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
+        TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+
+        return;
+      }
+
+      // Insert a new block after the current one and use it as a trampoline.
+      TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);
+
+      // If the other successor is a fall-through, invert the condition code.
+      const BinaryBasicBlock *const NextBB =
+          BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
+      if (BB->getConditionalSuccessor(false) == NextBB) {
+        BB->swapConditionalSuccessors();
+        auto L = BC.scopeLock();
+        MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
+      } else {
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+      }
+      BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+    };
+
+    bool MayNeedRelaxation;
+    uint64_t NumIterations = 0;
+    do {
+      MayNeedRelaxation = false;
+      ++NumIterations;
+      for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
+        BinaryBasicBlock *BB = *BBI;
+        uint64_t NextInstOffset = BB->getOutputStartAddress();
+        for (MCInst &Inst : *BB) {
+          const size_t InstAddress = NextInstOffset;
+          if (!MIB->isPseudo(Inst))
+            NextInstOffset += 4;
+
+          if (!mayNeedStub(BF.getBinaryContext(), Inst))
+            continue;
+
+          const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);
+
+          // Span of +/-128MB.
+          if (BitsAvailable == LongestJumpBits)
+            continue;
+
+          const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
+          BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
+          assert(TargetBB &&
+                 "Basic block target expected for conditional branch.");
+
+          // Check if the relaxation is needed.
+          if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
+              isBlockInRange(Inst, InstAddress, *TargetBB))
+            continue;
+
+          relaxBranch(BB, Inst, InstAddress, TargetBB);
+
+          MayNeedRelaxation = true;
+        }
+      }
+
+      // We may have added new instructions, but the whole fragment is less
+      // than the minimum branch span.
+      if (FragmentSize < ShortestJumpSpan)
+        MayNeedRelaxation = false;
+
+    } while (MayNeedRelaxation);
+
+    LLVM_DEBUG({
+      if (NumIterations > 2) {
+        dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
+               << " of " << BF << " in " << NumIterations << " iterations\n";
+      }
+    });
+  }
+
+  // Add trampoline blocks from all fragments to the layout.
+  DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
+      Insertions;
+  for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
+       FunctionTrampolines) {
+    if (!Pair.second)
+      continue;
+    Insertions[Pair.first].emplace_back(std::move(Pair.second));
+  }
+
+  for (auto &Pair : Insertions) {
+    BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
+                         /*UpdateLayout*/ true, /*UpdateCFI*/ true,
+                         /*RecomputeLPs*/ false);
+  }
+}
+
 Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
+
+  if (opts::CompactCodeModel) {
+    BC.outs()
+        << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";
+
+    ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+      relaxLocalBranches(BF);
+    };
+
+    ParallelUtilities::PredicateTy SkipPredicate =
+        [&](const BinaryFunction &BF) {
+          return !BC.shouldEmit(BF) || !BF.isSimple();
+        };
+
+    ParallelUtilities::runOnEachFunction(
+        BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
+        SkipPredicate, "RelaxLocalBranches");
+
+    return Error::success();
+  }
+
   BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
   std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
   bool Modified;
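
Taken together, `relaxLocalBranches()` runs a fixed-point loop per fragment: estimate block offsets, find branches whose narrow encoding cannot reach their target (or whose target lives in another fragment), redirect them to a 4-byte trampoline holding an unconditional `B` with the full +/-128MB reach, then recompute offsets and repeat until nothing changes. The toy model below is a simplified sketch of that loop, not BOLT code: the `Block`/`fitsPCRel` names, the block sizes, and the 6-bit reach are made up for illustration, and offsets are measured between block starts rather than from the branch instruction itself.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct Block {
  uint64_t Size;   // estimated code size in bytes
  int Target = -1; // index of the branch-target block; -1 if no branch
};

// True iff Off is representable as a signed `Bits`-bit byte offset.
static bool fitsPCRel(unsigned Bits, int64_t Off) {
  const int64_t Limit = INT64_C(1) << (Bits - 1);
  return Off >= -Limit && Off < Limit;
}

int main() {
  // One conditional branch in block 0 targeting block 2, 64 bytes away.
  // Pretend its encoding reaches only +/-32 bytes (Bits = 6) so that a
  // single round of relaxation is forced.
  constexpr unsigned Bits = 6;
  std::vector<Block> Blocks = {{4, 2}, {60, -1}, {4, -1}};

  bool Changed;
  do {
    Changed = false;

    // Recompute block start offsets from the current sizes.
    std::vector<int64_t> Start(Blocks.size(), 0);
    for (size_t I = 1; I < Blocks.size(); ++I)
      Start[I] = Start[I - 1] + (int64_t)Blocks[I - 1].Size;

    for (size_t I = 0; I < Blocks.size(); ++I) {
      const int T = Blocks[I].Target;
      if (T < 0 || fitsPCRel(Bits, Start[T] - Start[I]))
        continue;

      // Out of range: place a 4-byte trampoline right after block I. In the
      // real pass the trampoline holds "b <target>", whose 28-bit offset
      // always reaches within a <128MB binary, so we model it as Target = -1.
      Blocks.insert(Blocks.begin() + I + 1, Block{4, -1});
      for (Block &B : Blocks) // indices past the insertion point shifted
        if (B.Target > (int)I)
          ++B.Target;
      Blocks[I].Target = (int)I + 1; // retarget the branch to the trampoline

      Changed = true;
      break; // offsets are stale; recompute and rescan
    }
  } while (Changed);

  std::printf("relaxed layout: %zu blocks\n", Blocks.size());
  return 0;
}
```

With the patch applied, this path would be selected by running something like `llvm-bolt <input> -o <output> --compact-code-model` on an AArch64 binary; per the `SkipPredicate` above, functions BOLT will not emit, and non-simple functions, are left untouched.
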