Skip to content

Reapply "[lld] Support thumb PLTs" (#93631) #93644

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 123 additions & 53 deletions lld/ELF/Arch/ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,36 +231,71 @@ static void writePltHeaderLong(uint8_t *buf) {
// The default PLT header requires the .got.plt to be within 128 Mb of the
// .plt in the positive direction.
void ARM::writePltHeader(uint8_t *buf) const {
// Use a similar sequence to that in writePlt(), the difference is the calling
// conventions mean we use lr instead of ip. The PLT entry is responsible for
// saving lr on the stack, the dynamic loader is responsible for reloading
// it.
const uint32_t pltData[] = {
0xe52de004, // L1: str lr, [sp,#-4]!
0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4)
0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4)
0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
};

uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
if (!llvm::isUInt<27>(offset)) {
// We cannot encode the Offset, use the long form.
writePltHeaderLong(buf);
return;
if (config->armThumbPLTs) {
// The instruction sequence for thumb:
//
// 0: b500 push {lr}
// 2: f8df e008 ldr.w lr, [pc, #0x8] @ 0xe <func+0xe>
// 6: 44fe add lr, pc
// 8: f85e ff08 ldr pc, [lr, #8]!
// e: .word .got.plt - .plt - 16
//
// At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from
// `pc` in the add instruction and 8 bytes for the `lr` adjustment.
//
uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16;
assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
write16(buf + 0, 0xb500);
// Split into two halves to support endianness correctly.
write16(buf + 2, 0xf8df);
write16(buf + 4, 0xe008);
write16(buf + 6, 0x44fe);
// Split into two halves to support endianness correctly.
write16(buf + 8, 0xf85e);
write16(buf + 10, 0xff08);
write32(buf + 12, offset);

memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
memcpy(buf + 20, trapInstr.data(), 4);
memcpy(buf + 24, trapInstr.data(), 4);
memcpy(buf + 28, trapInstr.data(), 4);
} else {
// Use a similar sequence to that in writePlt(), the difference is the
// calling conventions mean we use lr instead of ip. The PLT entry is
// responsible for saving lr on the stack, the dynamic loader is responsible
// for reloading it.
const uint32_t pltData[] = {
0xe52de004, // L1: str lr, [sp,#-4]!
0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4)
0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4)
0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
};

uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
if (!llvm::isUInt<27>(offset)) {
// We cannot encode the Offset, use the long form.
writePltHeaderLong(buf);
return;
}
write32(buf + 0, pltData[0]);
write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
write32(buf + 12, pltData[3] | (offset & 0xfff));
memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
memcpy(buf + 20, trapInstr.data(), 4);
memcpy(buf + 24, trapInstr.data(), 4);
memcpy(buf + 28, trapInstr.data(), 4);
}
write32(buf + 0, pltData[0]);
write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
write32(buf + 12, pltData[3] | (offset & 0xfff));
memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
memcpy(buf + 20, trapInstr.data(), 4);
memcpy(buf + 24, trapInstr.data(), 4);
memcpy(buf + 28, trapInstr.data(), 4);
}

void ARM::addPltHeaderSymbols(InputSection &isec) const {
addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
if (config->armThumbPLTs) {
addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec);
addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec);
} else {
addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
}
}

// Long form PLT entries that do not have any restrictions on the displacement
Expand All @@ -279,32 +314,65 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr,
// .plt in the positive direction.
void ARM::writePlt(uint8_t *buf, const Symbol &sym,
uint64_t pltEntryAddr) const {
// The PLT entry is similar to the example given in Appendix A of ELF for
// the Arm Architecture. Instead of using the Group Relocations to find the
// optimal rotation for the 8-bit immediate used in the add instructions we
// hard code the most compact rotations for simplicity. This saves a load
// instruction over the long plt sequences.
const uint32_t pltData[] = {
0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8
0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8
0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
};

uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
if (!llvm::isUInt<27>(offset)) {
// We cannot encode the Offset, use the long form.
writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
return;
if (!config->armThumbPLTs) {
uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;

// The PLT entry is similar to the example given in Appendix A of ELF for
// the Arm Architecture. Instead of using the Group Relocations to find the
// optimal rotation for the 8-bit immediate used in the add instructions we
// hard code the most compact rotations for simplicity. This saves a load
// instruction over the long plt sequences.
const uint32_t pltData[] = {
0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8
0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8
0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
};
if (!llvm::isUInt<27>(offset)) {
// We cannot encode the Offset, use the long form.
writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
return;
}
write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
write32(buf + 8, pltData[2] | (offset & 0xfff));
memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
} else {
uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12;
assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");

// A PLT entry will be:
//
// movw ip, #<lower 16 bits>
// movt ip, #<upper 16 bits>
// add ip, pc
// L1: ldr.w pc, [ip]
// b L1
//
// where ip = r12 = 0xc

// movw ip, #<lower 16 bits>
write16(buf + 2, 0x0c00); // use `ip`
relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, offset);

// movt ip, #<upper 16 bits>
write16(buf + 6, 0x0c00); // use `ip`
relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, offset);

write16(buf + 8, 0x44fc); // add ip, pc
write16(buf + 10, 0xf8dc); // ldr.w pc, [ip] (bottom half)
write16(buf + 12, 0xf000); // ldr.w pc, [ip] (upper half)
write16(buf + 14, 0xe7fc); // Branch to previous instruction
}
write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
write32(buf + 8, pltData[2] | (offset & 0xfff));
memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
}

void ARM::addPltSymbols(InputSection &isec, uint64_t off) const {
addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
if (config->armThumbPLTs) {
addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec);
} else {
addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
}
}

bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
Expand All @@ -325,6 +393,8 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
case R_ARM_JUMP24:
// Source is ARM, all PLT entries are ARM so no interworking required.
// Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb).
assert(!config->armThumbPLTs &&
"If the source is ARM, we should not need Thumb PLTs");
if (s.isFunc() && expr == R_PC && (s.getVA() & 1))
return true;
[[fallthrough]];
Expand All @@ -335,9 +405,9 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
}
case R_ARM_THM_JUMP19:
case R_ARM_THM_JUMP24:
// Source is Thumb, all PLT entries are ARM so interworking is required.
// Source is Thumb, when all PLT entries are ARM interworking is required.
// Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM).
if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0))
if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0))
return true;
[[fallthrough]];
case R_ARM_THM_CALL: {
Expand Down Expand Up @@ -547,7 +617,6 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
// STT_FUNC we choose whether to write a BL or BLX depending on the
// value of bit 0 of Val. With bit 0 == 1 denoting Thumb. If the symbol is
// not of type STT_FUNC then we must preserve the original instruction.
// PLT entries are always ARM state so we know we don't need to interwork.
assert(rel.sym); // R_ARM_CALL is always reached via relocate().
bool bit0Thumb = val & 1;
bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000;
Expand Down Expand Up @@ -606,12 +675,13 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
// PLT entries are always ARM state so we know we need to interwork.
assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate().
bool bit0Thumb = val & 1;
bool useThumb = bit0Thumb || config->armThumbPLTs;
bool isBlx = (read16(loc + 2) & 0x1000) == 0;
// lld 10.0 and before always used bit0Thumb when deciding to write a BLX
// even when type not STT_FUNC. PLT entries generated by LLD are always ARM.
if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb)
// even when type not STT_FUNC.
if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb)
stateChangeWarning(loc, rel.type, *rel.sym);
if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) {
if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) {
// We are writing a BLX. Ensure BLX destination is 4-byte aligned. As
// the BLX instruction may only be two byte aligned. This must be done
// before overflow check.
Expand Down
1 change: 1 addition & 0 deletions lld/ELF/Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ struct Config {
bool allowMultipleDefinition;
bool fatLTOObjects;
bool androidPackDynRelocs = false;
bool armThumbPLTs = false;
bool armHasBlx = false;
bool armHasMovtMovw = false;
bool armJ1J2BranchEncoding = false;
Expand Down
12 changes: 12 additions & 0 deletions lld/ELF/InputFiles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,18 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) {
if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base &&
profile == ARMBuildAttrs::MicroControllerProfile)
config->armCMSESupport = true;

// The thumb PLT entries require Thumb2 which can be used on multiple archs.
// For now, let's limit it to ones where ARM isn't available and we know have
// Thumb2.
std::optional<unsigned> armISA =
attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use);
std::optional<unsigned> thumb =
attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use);
bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed;
bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32;
if (noArmISA && hasThumb2)
config->armThumbPLTs = true;
}

InputFile::InputFile(Kind k, MemoryBufferRef m)
Expand Down
126 changes: 126 additions & 0 deletions lld/test/ELF/armv8-thumb-plt-reloc.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// REQUIRES: arm
// RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=thumbv8-none-linux-gnueabi --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.o
// RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=thumbv8-none-linux-gnueabi --arch=thumb --mcpu=cortex-m33 %s -o %t2.o
// RUN: ld.lld %t1.o %t2.o -o %t
// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s
// RUN: ld.lld -shared %t1.o %t2.o -o %t.so
// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s
// RUN: llvm-readelf -S -r %t.so | FileCheck -check-prefix=DSOREL %s

// RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=thumbv8-none-linux-gnueabi --arch=thumbeb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.be.o
// RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=thumbv8-none-linux-gnueabi --arch=thumbeb --mcpu=cortex-m33 %s -o %t2.be.o
// RUN: ld.lld %t1.be.o %t2.be.o -o %t.be
// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
// RUN: ld.lld -shared %t1.be.o %t2.be.o -o %t.so.be
// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s

// RUN: ld.lld --be8 %t1.be.o %t2.be.o -o %t.be
// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
// RUN: ld.lld --be8 -shared %t1.be.o %t2.be.o -o %t.so.be
// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s

/// Test PLT entry generation
.text
.align 2
.globl _start
.type _start,%function
_start:
bl func1
bl func2
bl func3
b.w func1
b.w func2
b.w func3
beq.w func1
beq.w func2
beq.w func3

/// Executable, expect no PLT
// CHECK: Disassembly of section .text:
// CHECK-EMPTY:
// CHECK-NEXT: <func1>:
// CHECK-NEXT: bx lr
// CHECK: <func2>:
// CHECK-NEXT: bx lr
// CHECK: <func3>:
// CHECK-NEXT: bx lr
// CHECK-NEXT: d4d4
// CHECK: <_start>:
// CHECK-NEXT: bl {{.*}} <func1>
// CHECK-NEXT: bl {{.*}} <func2>
// CHECK-NEXT: bl {{.*}} <func3>
// CHECK-NEXT: b.w {{.*}} <func1>
// CHECK-NEXT: b.w {{.*}} <func2>
// CHECK-NEXT: b.w {{.*}} <func3>
// CHECK-NEXT: beq.w {{.*}} <func1>
// CHECK-NEXT: beq.w {{.*}} <func2>
// CHECK-NEXT: beq.w {{.*}} <func3>

// DSO: Disassembly of section .text:
// DSO-EMPTY:
// DSO-NEXT: <func1>:
// DSO-NEXT: bx lr
// DSO: <func2>:
// DSO-NEXT: bx lr
// DSO: <func3>:
// DSO-NEXT: bx lr
// DSO-NEXT: d4d4
// DSO: <_start>:
/// 0x10260 = PLT func1
// DSO-NEXT: bl 0x10260
/// 0x10270 = PLT func2
// DSO-NEXT: bl 0x10270
/// 0x10280 = PLT func3
// DSO-NEXT: bl 0x10280
/// 0x10260 = PLT func1
// DSO-NEXT: b.w 0x10260
/// 0x10270 = PLT func2
// DSO-NEXT: b.w 0x10270
/// 0x10280 = PLT func3
// DSO-NEXT: b.w 0x10280
/// 0x10260 = PLT func1
// DSO-NEXT: beq.w 0x10260
/// 0x10270 = PLT func2
// DSO-NEXT: beq.w 0x10270
/// 0x10280 = PLT func3
// DSO-NEXT: beq.w 0x10280
// DSO: Disassembly of section .plt:
// DSO-EMPTY:
// DSO-NEXT: 10240 <.plt>:
// DSO-NEXT: push {lr}
// DSO-NEXT: ldr.w lr, [pc, #8]
// DSO-NEXT: add lr, pc
// DSO-NEXT: ldr pc, [lr, #8]!
/// 0x20098 = .got.plt (0x302D8) - pc (0x10238 = .plt + 8) - 8
// DSO-NEXT: .word 0x00020098
// DSO-NEXT: .word 0xd4d4d4d4
// DSO-NEXT: .word 0xd4d4d4d4
// DSO-NEXT: .word 0xd4d4d4d4
// DSO-NEXT: .word 0xd4d4d4d4

/// 136 + 2 << 16 + 0x1026c = 0x302f4 = got entry 1
// DSO-NEXT: 10260: f240 0c88 movw r12, #136
// DSO-NEXT: f2c0 0c02 movt r12, #2
// DSO-NEXT: 44fc add r12, pc
// DSO-NEXT: f8dc f000 ldr.w pc, [r12]
// DSO-NEXT: e7fc b 0x1026a
/// 124 + 2 << 16 + 0x1027c = 0x302f8 = got entry 2
// DSO-NEXT: 10270: f240 0c7c movw r12, #124
// DSO-NEXT: f2c0 0c02 movt r12, #2
// DSO-NEXT: 44fc add r12, pc
// DSO-NEXT: f8dc f000 ldr.w pc, [r12]
// DSO-NEXT: e7fc b 0x1027a
/// 112 + 2 << 16 + 0x1028c = 0x302fc = got entry 3
// DSO-NEXT: 10280: f240 0c70 movw r12, #112
// DSO-NEXT: f2c0 0c02 movt r12, #2
// DSO-NEXT: 44fc add r12, pc
// DSO-NEXT: f8dc f000 ldr.w pc, [r12]
// DSO-NEXT: e7fc b 0x1028a

// DSOREL: .got.plt PROGBITS 000302e8 {{.*}} 000018 00 WA 0 0 4
// DSOREL: Relocation section '.rel.plt'
// DSOREL: 000302f4 {{.*}} R_ARM_JUMP_SLOT {{.*}} func1
// DSOREL: 000302f8 {{.*}} R_ARM_JUMP_SLOT {{.*}} func2
// DSOREL: 000302fc {{.*}} R_ARM_JUMP_SLOT {{.*}} func3
Loading