Skip to content

[lld][LoongArch] Relax call36/tail36: R_LARCH_CALL36 #123576

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Mar 9, 2025
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 155 additions & 2 deletions lld/ELF/Arch/LoongArch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,13 @@ enum Op {
ADDI_W = 0x02800000,
ADDI_D = 0x02c00000,
ANDI = 0x03400000,
PCADDI = 0x18000000,
PCADDU12I = 0x1c000000,
LD_W = 0x28800000,
LD_D = 0x28c00000,
JIRL = 0x4c000000,
B = 0x50000000,
BL = 0x54000000,
};

enum Reg {
Expand Down Expand Up @@ -131,6 +134,10 @@ static uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end) {
return begin == 63 ? v >> end : (v & ((1ULL << (begin + 1)) - 1)) >> end;
}

static uint32_t getD5(uint64_t v) { return extractBits(v, 4, 0); }

static uint32_t getJ5(uint64_t v) { return extractBits(v, 9, 5); }

static uint32_t setD5k16(uint32_t insn, uint32_t imm) {
uint32_t immLo = extractBits(imm, 15, 0);
uint32_t immHi = extractBits(imm, 20, 16);
Expand Down Expand Up @@ -743,6 +750,119 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
}
}

static bool relaxable(ArrayRef<Relocation> relocs, size_t i) {
return i + 1 < relocs.size() && relocs[i + 1].type == R_LARCH_RELAX;
}

static bool isPairRelaxable(ArrayRef<Relocation> relocs, size_t i) {
return relaxable(relocs, i) && relaxable(relocs, i + 2) &&
relocs[i].offset + 4 == relocs[i + 2].offset;
}

// Relax code sequence.
// From:
// pcalau12i $a0, %pc_hi20(sym)
// addi.w/d $a0, $a0, %pc_lo12(sym)
// To:
// pcaddi $a0, %pc_lo12(sym)
//
// From:
// pcalau12i $a0, %got_pc_hi20(sym_got)
// ld.w/d $a0, $a0, %got_pc_lo12(sym_got)
// To:
// pcaddi $a0, %got_pc_hi20(sym_got)
static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i,
uint64_t loc, Relocation &rHi20, Relocation &rLo12,
uint32_t &remove) {
// check if the relocations are relaxable sequences.
if (!((rHi20.type == R_LARCH_PCALA_HI20 &&
rLo12.type == R_LARCH_PCALA_LO12) ||
(rHi20.type == R_LARCH_GOT_PC_HI20 &&
rLo12.type == R_LARCH_GOT_PC_LO12)))
return;

// GOT references to absolute symbols can't be relaxed to use pcaddi in
// position-independent code, because these instructions produce a relative
// address.
// Meanwhile skip undefined, preemptible and STT_GNU_IFUNC symbols, because
// these symbols may be resolve in runtime.
if (rHi20.type == R_LARCH_GOT_PC_HI20 &&
(!rHi20.sym->isDefined() || rHi20.sym->isPreemptible ||
rHi20.sym->isGnuIFunc() ||
(ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section)))
return;

uint64_t symBase = 0;
if (rHi20.expr == RE_LOONGARCH_PLT_PAGE_PC)
symBase = rHi20.sym->getPltVA(ctx);
else if (rHi20.expr == RE_LOONGARCH_PAGE_PC ||
rHi20.expr == RE_LOONGARCH_GOT_PAGE_PC)
symBase = rHi20.sym->getVA(ctx);
else {
Err(ctx) << getErrorLoc(ctx, (const uint8_t *)loc) << "unknown expr ("
<< rHi20.expr << ") against symbol " << rHi20.sym
<< "in relaxPCHi20Lo12";
return;
}
const uint64_t symLocal = symBase + rHi20.addend;

const int64_t distance = symLocal - loc;
// Check if the distance aligns 4 bytes or exceeds the range of pcaddi.
if ((distance & 0x3) != 0 || !isInt<22>(distance))
return;

// Note: If we can ensure that the .o files generated by LLVM only contain
// relaxable instruction sequences with R_LARCH_RELAX, then we do not need to
// decode instructions. The relaxable instruction sequences imply the
// following constraints:
// * For relocation pairs related to got_pc, the opcodes of instructions
// must be pcalau12i + ld.w/d. In other cases, the opcodes must be pcalau12i +
// addi.w/d.
// * The destination register of pcalau12i is guaranteed to be used only by
// the immediately following instruction.
const uint32_t currInsn = read32le(sec.content().data() + rHi20.offset);
const uint32_t nextInsn = read32le(sec.content().data() + rLo12.offset);
// Check if use the same register.
if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
return;

sec.relaxAux->relocTypes[i] = R_LARCH_RELAX;
sec.relaxAux->relocTypes[i + 2] = R_LARCH_PCREL20_S2;
sec.relaxAux->writes.push_back(insn(PCADDI, getD5(nextInsn), 0, 0));
remove = 4;
}

// Relax code sequence.
// From:
// pcaddu18i $ra, %call36(foo)
// jirl $ra, $ra, 0
// To:
// b/bl foo
static void relaxCall36(Ctx &ctx, const InputSection &sec, size_t i,
uint64_t loc, Relocation &r, uint32_t &remove) {
const uint64_t symLocal =
(r.expr == R_PLT_PC ? r.sym->getPltVA(ctx) : r.sym->getVA(ctx)) +
r.addend;

const int64_t distance = symLocal - loc;
// Check if the distance aligns 4 bytes or exceeds the range of b[l].
if ((distance & 0x3) != 0 || !isInt<28>(distance))
return;

const uint32_t nextInsn = read32le(sec.content().data() + r.offset + 4);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To match ld.bfd behavior we should check isJirl(nextInsn) before continuing. In #127312 (comment) I commented that we should probably add stronger checks for ld.bfd too, but let's make the current behavior aligned at first.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is not necessary. Here are the reasons:

  1. Existing compilers, whether GCC or Clang, always generate the correct instruction sequence corresponding to the R_LARCH_CALL36 relocation when it is produced.
  2. For handwritten assembly, correct use of pseudo-instructions will not lead to errors. For maliciously written incorrect assembly cases, we do not need to ensure their correctness, as such erroneous cases can always be constructed in most architectures.
  3. Instruction checking should be avoided as much as possible in lld. (https://github.com/llvm/llvm-project/pull/127312/files#r1957198091)

It would be greatly appreciated if you could provide an example of an illegal case or present more substantial reasons.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is not necessary. Here are the reasons:

1. Existing compilers, whether GCC or Clang, always generate the correct instruction sequence corresponding to the R_LARCH_CALL36 relocation when it is produced.

2. For handwritten assembly, correct use of pseudo-instructions will not lead to errors. For maliciously written incorrect assembly cases, we do not need to ensure their correctness, as such erroneous cases can always be constructed in most architectures.

3. Instruction checking should be avoided as much as possible in lld. (https://github.com/llvm/llvm-project/pull/127312/files#r1957198091)

It would be greatly appreciated if you could provide an example of an illegal case or present more substantial reasons.

I think it's probably okay to not add more checks, but for ld.bfd interoperability the check on jirl is necessary. If you think this should not be done either, the ld.bfd implementation likely needs change as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After further consideration and revisiting the manual again(https://github.com/loongson/la-abi-specs/blob/release/laelf.adoc),
image
the generated pcaddu18i and jirl by GCC and Clang must be adjacent.

As for the interoperability you mentioned, lld handles instruction sequences generated by gas. And gas should not generate non-contiguous instruction sequences either. Therefore, we think that adding a check in lld is unnecessary.

Additionally, if jirl check is added here, other relaxations (such as R_LARCH_{PCALA,GOT_PC}_{HI20,LO12}, etc) would also need to check the instructions, which contradicts the principle of avoiding instruction checks unless absolutely necessary.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After further consideration and revisiting the manual again(https://github.com/loongson/la-abi-specs/blob/release/laelf.adoc), the generated pcaddu18i and jirl by GCC and Clang must be adjacent.

As for the interoperability you mentioned, lld handles instruction sequences generated by gas. And gas should not generate non-contiguous instruction sequences either. Therefore, we think that adding a check in lld is unnecessary.

Additionally, if jirl check is added here, other relaxations (such as R_LARCH_{PCALA,GOT_PC}_{HI20,LO12}, etc) would also need to check the instructions, which contradicts the principle of avoiding instruction checks unless absolutely necessary.

I'm not objecting to the current revision per se, but rather behavior consistency, no matter whether the input is well-formed or not -- the so-called "bug-for-bug compatibility". This means:

  • either we add a check for jirl here in LLD and exactly replicate ld.bfd behavior, or
  • we leave the code as-is, and remove the check at ld.bfd side, with your comment here as justification,

so eventually we're going to have 2 consistent implementations of LoongArch ELF psABI and happy users. What do you think here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We discussed with developers of ld's loongarch port, but they think the check is necessary in ld and lld don't need to align with it.

How about adding an assertion here as a compromise?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I have added an assertion to check jirl. 52bec2b

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the late reply -- I didn't realize "assertion" is just assert before.

But now debug builds of lld will error out on malformed R_LARCH_CALL36 underlying data -- this doesn't align with ld.bfd either because it will simply skip relaxing this piece instead of dying, so I fear the assertion may do more harm than good 🤷

Copy link
Contributor Author

@ylzsx ylzsx Mar 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We discussed with developers of ld's loongarch port, but they think the check is necessary in ld and lld don't need to align with it.

@MaskRay Would you agree to adding a check for jirl in this situation?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still prefer that we remove the checks. We should avoid assert, when it is actually reachable.

If BFD wants to be guard against malformed input, that's ok. It's fine to diverge from it.

if (getD5(nextInsn) == R_RA) {
// convert jirl to bl
sec.relaxAux->relocTypes[i] = R_LARCH_B26;
sec.relaxAux->writes.push_back(insn(BL, 0, 0, 0));
remove = 4;
} else if (getD5(nextInsn) == R_ZERO) {
// convert jirl to b
sec.relaxAux->relocTypes[i] = R_LARCH_B26;
sec.relaxAux->writes.push_back(insn(B, 0, 0, 0));
remove = 4;
}
}

static bool relax(Ctx &ctx, InputSection &sec) {
const uint64_t secAddr = sec.getVA();
const MutableArrayRef<Relocation> relocs = sec.relocs();
Expand Down Expand Up @@ -781,6 +901,16 @@ static bool relax(Ctx &ctx, InputSection &sec) {
}
break;
}
case R_LARCH_PCALA_HI20:
case R_LARCH_GOT_PC_HI20:
// The overflow check for i+2 will be carried out in isPairRelaxable.
if (isPairRelaxable(relocs, i))
relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove);
break;
case R_LARCH_CALL36:
if (relaxable(relocs, i))
relaxCall36(ctx, sec, i, loc, r, remove);
break;
}

// For all anchors whose offsets are <= r.offset, they are preceded by
Expand Down Expand Up @@ -851,6 +981,7 @@ void LoongArch::finalizeRelax(int passes) const {
MutableArrayRef<Relocation> rels = sec->relocs();
ArrayRef<uint8_t> old = sec->content();
size_t newSize = old.size() - aux.relocDeltas[rels.size() - 1];
size_t writesIdx = 0;
uint8_t *p = ctx.bAlloc.Allocate<uint8_t>(newSize);
uint64_t offset = 0;
int64_t delta = 0;
Expand All @@ -867,11 +998,33 @@ void LoongArch::finalizeRelax(int passes) const {
continue;

// Copy from last location to the current relocated location.
const Relocation &r = rels[i];
Relocation &r = rels[i];
uint64_t size = r.offset - offset;
memcpy(p, old.data() + offset, size);
p += size;
offset = r.offset + remove;

int64_t skip = 0;
if (RelType newType = aux.relocTypes[i]) {
switch (newType) {
case R_LARCH_RELAX:
break;
case R_LARCH_PCREL20_S2:
skip = 4;
write32le(p, aux.writes[writesIdx++]);
// RelExpr is needed for relocating.
r.expr = r.sym->hasFlag(NEEDS_PLT) ? R_PLT_PC : R_PC;
break;
case R_LARCH_B26:
skip = 4;
write32le(p, aux.writes[writesIdx++]);
break;
default:
llvm_unreachable("unsupported type");
}
}

p += skip;
offset = r.offset + skip + remove;
}
memcpy(p, old.data() + offset, old.size() - offset);

Expand Down
129 changes: 82 additions & 47 deletions lld/test/ELF/loongarch-relax-align.s
Original file line number Diff line number Diff line change
Expand Up @@ -2,60 +2,95 @@

# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax %s -o %t.32.o
# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.64.o
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.32.o -o %t.32
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.64.o -o %t.64
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.32.o --no-relax -o %t.32n
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.64.o --no-relax -o %t.64n
# RUN: llvm-objdump -td --no-show-raw-insn %t.32 | FileCheck %s
# RUN: llvm-objdump -td --no-show-raw-insn %t.64 | FileCheck %s
# RUN: llvm-objdump -td --no-show-raw-insn %t.32n | FileCheck %s
# RUN: llvm-objdump -td --no-show-raw-insn %t.64n | FileCheck %s
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 --relax %t.32.o -o %t.32
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 --relax %t.64.o -o %t.64
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 --no-relax %t.32.o -o %t.32n
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 --no-relax %t.64.o -o %t.64n
# RUN: llvm-objdump -td --no-show-raw-insn %t.32 | FileCheck --check-prefixes=RELAX,NOOLD %s
# RUN: llvm-objdump -td --no-show-raw-insn %t.64 | FileCheck --check-prefixes=RELAX,NOOLD %s
# RUN: llvm-objdump -td --no-show-raw-insn %t.32n | FileCheck --check-prefixes=NORELAX,NOOLD %s
# RUN: llvm-objdump -td --no-show-raw-insn %t.64n | FileCheck --check-prefixes=NORELAX,NOOLD %s

## Test the R_LARCH_ALIGN without symbol index.
# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.o64.o --defsym=old=1
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.o64.o -o %t.o64
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.o64.o --no-relax -o %t.o64n
# RUN: llvm-objdump -td --no-show-raw-insn %t.o64 | FileCheck %s
# RUN: llvm-objdump -td --no-show-raw-insn %t.o64n | FileCheck %s
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 --relax %t.o64.o -o %t.o64
# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 --no-relax %t.o64.o -o %t.o64n
# RUN: llvm-objdump -td --no-show-raw-insn %t.o64 | FileCheck --check-prefixes=RELAX,OLD %s
# RUN: llvm-objdump -td --no-show-raw-insn %t.o64n | FileCheck --check-prefixes=NORELAX,OLD %s

## -r keeps section contents unchanged.
# RUN: ld.lld -r %t.64.o -o %t.64.r
# RUN: ld.lld -r --relax %t.64.o -o %t.64.r
# RUN: llvm-objdump -dr --no-show-raw-insn %t.64.r | FileCheck %s --check-prefix=CHECKR

# CHECK-DAG: {{0*}}10000 l .text {{0*}}44 .Ltext_start
# CHECK-DAG: {{0*}}10038 l .text {{0*}}0c .L1
# CHECK-DAG: {{0*}}10040 l .text {{0*}}04 .L2
# CHECK-DAG: {{0*}}20000 l .text2 {{0*}}14 .Ltext2_start

# CHECK: <.Ltext_start>:
# CHECK-NEXT: break 1
# CHECK-NEXT: break 2
# CHECK-NEXT: nop
# CHECK-NEXT: nop
# CHECK-NEXT: break 3
# CHECK-NEXT: break 4
# CHECK-NEXT: nop
# CHECK-NEXT: nop
# CHECK-NEXT: pcalau12i $a0, 0
# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 0
# CHECK-NEXT: pcalau12i $a0, 0
# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 56
# CHECK-NEXT: pcalau12i $a0, 0
# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 64
# CHECK-EMPTY:
# CHECK-NEXT: <.L1>:
# CHECK-NEXT: nop
# CHECK-NEXT: nop
# CHECK-EMPTY:
# CHECK-NEXT: <.L2>:
# CHECK-NEXT: break 5

# CHECK: <.Ltext2_start>:
# CHECK-NEXT: pcalau12i $a0, 0
# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 0
# CHECK-NEXT: nop
# CHECK-NEXT: nop
# CHECK-NEXT: break 6
# NOOLD: {{0*}}10000 l .text {{0*}}00 .Lalign_symbol
# OLD: {{0*}}00001 l *ABS* {{0*}}00 old

# NORELAX-DAG: {{0*}}10000 l .text {{0*}}44 .Ltext_start
# NORELAX-DAG: {{0*}}10038 l .text {{0*}}0c .L1
# NORELAX-DAG: {{0*}}10040 l .text {{0*}}04 .L2
# NORELAX-DAG: {{0*}}20000 l .text2 {{0*}}14 .Ltext2_start

# NORELAX: <.Ltext_start>:
# NORELAX-NEXT: break 1
# NORELAX-NEXT: break 2
# NORELAX-NEXT: nop
# NORELAX-NEXT: nop
# NORELAX-NEXT: break 3
# NORELAX-NEXT: break 4
# NORELAX-NEXT: nop
# NORELAX-NEXT: nop
# NORELAX-NEXT: pcalau12i $a0, 0
# NORELAX-NEXT: addi.{{[dw]}} $a0, $a0, 0
# NORELAX-NEXT: pcalau12i $a0, 0
# NORELAX-NEXT: addi.{{[dw]}} $a0, $a0, 56
# NORELAX-NEXT: pcalau12i $a0, 0
# NORELAX-NEXT: addi.{{[dw]}} $a0, $a0, 64
# NORELAX-EMPTY:
# NORELAX-NEXT: <.L1>:
# NORELAX-NEXT: nop
# NORELAX-NEXT: nop
# NORELAX-EMPTY:
# NORELAX-NEXT: <.L2>:
# NORELAX-NEXT: break 5

# NORELAX: <.Ltext2_start>:
# NORELAX-NEXT: pcalau12i $a0, 0
# NORELAX-NEXT: addi.{{[dw]}} $a0, $a0, 0
# NORELAX-NEXT: nop
# NORELAX-NEXT: nop
# NORELAX-NEXT: break 6


# RELAX-DAG: {{0*}}10000 l .text {{0*}}34 .Ltext_start
# RELAX-DAG: {{0*}}1002c l .text {{0*}}08 .L1
# RELAX-DAG: {{0*}}10030 l .text {{0*}}04 .L2
# RELAX-DAG: {{0*}}20000 l .text2 {{0*}}14 .Ltext2_start

# RELAX: <.Ltext_start>:
# RELAX-NEXT: break 1
# RELAX-NEXT: break 2
# RELAX-NEXT: nop
# RELAX-NEXT: nop
# RELAX-NEXT: break 3
# RELAX-NEXT: break 4
# RELAX-NEXT: nop
# RELAX-NEXT: nop
# RELAX-NEXT: pcaddi $a0, -8
# RELAX-NEXT: pcaddi $a0, 2
# RELAX-NEXT: pcaddi $a0, 2
# RELAX-EMPTY:
# RELAX-NEXT: <.L1>:
# RELAX-NEXT: nop
# RELAX-EMPTY:
# RELAX-NEXT: <.L2>:
# RELAX-NEXT: break 5

# RELAX: <.Ltext2_start>:
# RELAX-NEXT: pcaddi $a0, 0
# RELAX-NEXT: nop
# RELAX-NEXT: nop
# RELAX-NEXT: nop
# RELAX-NEXT: break 6

# CHECKR: <.Ltext2_start>:
# CHECKR-NEXT: pcalau12i $a0, 0
Expand Down
Loading
Loading