Skip to content

Commit 4053196

Browse files
committed
[AArch64] fix trampoline implementation: use X15
AAPCS64 reserves any of X9-X15 for this purpose, and says not to use any of X16-X18 (which is what GCC chose). Simply choosing a different register fixes the problem of this being broken on any platform that actually follows the platform ABI. As a side benefit, this also generates slightly better code in the trampoline itself by following the XCore implementation instead of the PPC one (although following the RISC-V implementation might have been slightly more readable in hindsight).
1 parent 764e0cc commit 4053196

File tree

14 files changed

+385
-146
lines changed

14 files changed

+385
-146
lines changed

compiler-rt/lib/builtins/README.txt

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -272,11 +272,6 @@ switch32
272272
switch8
273273
switchu8
274274

275-
// This function generates a custom trampoline function with the specific
276-
// realFunc and localsPtr values.
277-
void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
278-
const void* realFunc, void* localsPtr);
279-
280275
// There is no C interface to the *_vfp_d8_d15_regs functions. There are
281276
// called in the prolog and epilog of Thumb1 functions. When the C++ ABI use
282277
// SJLJ for exceptions, each function with a catch clause or destructors needs

compiler-rt/lib/builtins/trampoline_setup.c

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
4141
__clear_cache(trampOnStack, &trampOnStack[10]);
4242
}
4343
#endif // __powerpc__ && !defined(__powerpc64__)
44-
45-
// The AArch64 compiler generates calls to __trampoline_setup() when creating
46-
// trampoline functions on the stack for use with nested functions.
47-
// This function creates a custom 36-byte trampoline function on the stack
48-
// which loads x18 with a pointer to the outer function's locals
49-
// and then jumps to the target nested function.
50-
// Note: x18 is a reserved platform register on Windows and macOS.
51-
52-
#if defined(__aarch64__) && defined(__ELF__)
53-
COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
54-
int trampSizeAllocated,
55-
const void *realFunc, void *localsPtr) {
56-
// This should never happen, but if compiler did not allocate
57-
// enough space on stack for the trampoline, abort.
58-
if (trampSizeAllocated < 36)
59-
compilerrt_abort();
60-
61-
// create trampoline
62-
// Load realFunc into x17. mov/movk 16 bits at a time.
63-
trampOnStack[0] =
64-
0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
65-
trampOnStack[1] =
66-
0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
67-
trampOnStack[2] =
68-
0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
69-
trampOnStack[3] =
70-
0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
71-
// Load localsPtr into x18
72-
trampOnStack[4] =
73-
0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
74-
trampOnStack[5] =
75-
0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
76-
trampOnStack[6] =
77-
0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
78-
trampOnStack[7] =
79-
0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
80-
trampOnStack[8] = 0xd61f0220; // br x17
81-
82-
// Clear instruction cache.
83-
__clear_cache(trampOnStack, &trampOnStack[9]);
84-
}
85-
#endif // defined(__aarch64__) && !defined(__APPLE__) && !defined(_WIN64)

compiler-rt/test/builtins/Unit/trampoline_setup_test.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
/*
99
* Tests nested functions
10-
* The ppc and aarch64 compilers generates a call to __trampoline_setup
10+
* The ppc compiler generates a call to __trampoline_setup
1111
* The i386 and x86_64 compilers generate a call to ___enable_execute_stack
1212
*/
1313

flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -274,12 +274,12 @@ class BoxedProcedurePass
274274
auto loc = embox.getLoc();
275275
mlir::Type i8Ty = builder.getI8Type();
276276
mlir::Type i8Ptr = builder.getRefType(i8Ty);
277-
// For AArch64, PPC32 and PPC64, the thunk is populated by a call to
277+
// For PPC32 and PPC64, the thunk is populated by a call to
278278
// __trampoline_setup, which is defined in
279279
// compiler-rt/lib/builtins/trampoline_setup.c and requires the
280-
// thunk size greater than 32 bytes. For RISCV and x86_64, the
281-
// thunk setup doesn't go through __trampoline_setup and fits in 32
282-
// bytes.
280+
// thunk size greater than 32 bytes. For AArch64, RISCV and x86_64,
281+
// the thunk setup doesn't go through __trampoline_setup and fits in
282+
// 32 bytes.
283283
fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
284284
mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
285285
auto buffer = builder.create<AllocaOp>(loc, buffTy);

flang/test/Fir/boxproc.fir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}
44

55
// CHECK-LABEL: define void @_QPtest_proc_dummy()
6-
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
6+
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
77
// CHECK-X86: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
88
// CHECK-PPC: %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
99
// CHECK: %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
@@ -63,7 +63,7 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
6363
}
6464

6565
// CHECK-LABEL: define void @_QPtest_proc_dummy_char()
66-
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
66+
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
6767
// CHECK-X86: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
6868
// CHECK-PPC: %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
6969
// CHECK: %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8

llvm/lib/Target/AArch64/AArch64CallingConvention.td

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ class CCIfSubtarget<string F, CCAction A>
2828
//===----------------------------------------------------------------------===//
2929

3030
defvar AArch64_Common = [
31+
// The 'nest' parameter, if any, is passed in X15.
32+
// The register previously used here (X18) is defined to be unavailable for
33+
// this purpose, whereas all of X9-X15 are defined to be free for LLVM to use
34+
// for this, so use X15 (which LLVM often already clobbers anyway).
35+
CCIfNest<CCAssignToReg<[X15]>>,
36+
3137
CCIfType<[iPTR], CCBitConvertToType<i64>>,
3238
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
3339
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
@@ -117,13 +123,7 @@ defvar AArch64_Common = [
117123
];
118124

119125
let Entry = 1 in
120-
def CC_AArch64_AAPCS : CallingConv<!listconcat(
121-
// The 'nest' parameter, if any, is passed in X18.
122-
// Darwin and Windows use X18 as the platform register and hence 'nest' isn't
123-
// currently supported there.
124-
[CCIfNest<CCAssignToReg<[X18]>>],
125-
AArch64_Common
126-
)>;
126+
def CC_AArch64_AAPCS : CallingConv<AArch64_Common>;
127127

128128
let Entry = 1 in
129129
def RetCC_AArch64_AAPCS : CallingConv<[
@@ -177,6 +177,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
177177
// a stack layout compatible with the x64 calling convention.
178178
let Entry = 1 in
179179
def CC_AArch64_Arm64EC_VarArg : CallingConv<[
180+
CCIfNest<CCAssignToReg<[X15]>>,
181+
180182
// Convert small floating-point values to integer.
181183
CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
182184
CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -353,6 +355,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[
353355
// + Stack slots are sized as needed rather than being at least 64-bit.
354356
let Entry = 1 in
355357
def CC_AArch64_DarwinPCS : CallingConv<[
358+
CCIfNest<CCAssignToReg<[X15]>>,
359+
356360
CCIfType<[iPTR], CCBitConvertToType<i64>>,
357361
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
358362
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -427,6 +431,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
427431

428432
let Entry = 1 in
429433
def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
434+
CCIfNest<CCAssignToReg<[X15]>>,
435+
430436
CCIfType<[iPTR], CCBitConvertToType<i64>>,
431437
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
432438
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -450,6 +456,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
450456
// same as the normal Darwin VarArgs handling.
451457
let Entry = 1 in
452458
def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
459+
CCIfNest<CCAssignToReg<[X15]>>,
460+
453461
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
454462
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
455463

@@ -494,6 +502,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
494502

495503
let Entry = 1 in
496504
def CC_AArch64_GHC : CallingConv<[
505+
CCIfNest<CCAssignToReg<[X15]>>,
506+
497507
CCIfType<[iPTR], CCBitConvertToType<i64>>,
498508

499509
// Handle all vector types as either f64 or v2f64.
@@ -522,6 +532,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
522532

523533
// We can pass arguments in all general registers, except:
524534
// - X8, used for sret
535+
// - X15 (on Windows), used as a temporary register in the prologue when allocating call frames
525536
// - X16/X17, used by the linker as IP0/IP1
526537
// - X18, the platform register
527538
// - X19, the base pointer

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1982,6 +1982,27 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
19821982
: 0;
19831983

19841984
if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
1985+
// Find an available register to spill the value of X15 to, if X15 is being
1986+
// used already for nest.
1987+
unsigned X15Scratch = AArch64::NoRegister;
1988+
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
1989+
if (llvm::any_of(MBB.liveins(),
1990+
[&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
1991+
return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
1992+
AArch64::X15, LiveIn.PhysReg);
1993+
})) {
1994+
X15Scratch = findScratchNonCalleeSaveRegister(&MBB);
1995+
assert(X15Scratch != AArch64::NoRegister);
1996+
#ifndef NDEBUG
1997+
LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
1998+
#endif
1999+
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
2000+
.addReg(AArch64::XZR)
2001+
.addReg(AArch64::X15, RegState::Undef)
2002+
.addReg(AArch64::X15, RegState::Implicit)
2003+
.setMIFlag(MachineInstr::FrameSetup);
2004+
}
2005+
19852006
uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
19862007
if (NeedsWinCFI) {
19872008
HasWinCFI = true;
@@ -2104,6 +2125,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
21042125
// we've set a frame pointer and already finished the SEH prologue.
21052126
assert(!NeedsWinCFI);
21062127
}
2128+
if (X15Scratch != AArch64::NoRegister) {
2129+
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
2130+
.addReg(AArch64::XZR)
2131+
.addReg(X15Scratch, RegState::Undef)
2132+
.addReg(X15Scratch, RegState::Implicit)
2133+
.setMIFlag(MachineInstr::FrameSetup);
2134+
}
21072135
}
21082136

21092137
StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 59 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -7339,59 +7339,80 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
73397339

73407340
SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
73417341
SelectionDAG &DAG) const {
7342-
// Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7343-
if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7344-
report_fatal_error(
7345-
"ADJUST_TRAMPOLINE operation is only supported on Linux.");
7346-
73477342
return Op.getOperand(0);
73487343
}
73497344

73507345
SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
73517346
SelectionDAG &DAG) const {
7352-
7353-
// Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7354-
if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7355-
report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
7356-
73577347
SDValue Chain = Op.getOperand(0);
7358-
SDValue Trmp = Op.getOperand(1); // trampoline
7348+
SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
73597349
SDValue FPtr = Op.getOperand(2); // nested function
73607350
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7361-
SDLoc dl(Op);
73627351

7363-
EVT PtrVT = getPointerTy(DAG.getDataLayout());
7364-
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
7352+
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
73657353

7366-
TargetLowering::ArgListTy Args;
7367-
TargetLowering::ArgListEntry Entry;
7354+
// ldr NestReg, .+16
7355+
// ldr x17, .+20
7356+
// br x17
7357+
// .word 0
7358+
// .nest: .qword nest
7359+
// .fptr: .qword fptr
7360+
SDValue OutChains[5];
73687361

7369-
Entry.Ty = IntPtrTy;
7370-
Entry.Node = Trmp;
7371-
Args.push_back(Entry);
7362+
const Function *Func =
7363+
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7364+
CallingConv::ID CC = Func->getCallingConv();
7365+
unsigned NestReg;
73727366

7373-
if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
7374-
MachineFunction &MF = DAG.getMachineFunction();
7375-
MachineFrameInfo &MFI = MF.getFrameInfo();
7376-
Entry.Node =
7377-
DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
7378-
} else
7379-
Entry.Node = DAG.getConstant(36, dl, MVT::i64);
7367+
// Pick the register number encoded into the trampoline's first `ldr`, based on the callee's calling convention.
switch (CC) {
7368+
default:
7369+
NestReg = 0x0f; // X15
break; // required: falling through would overwrite NestReg with X4 below
7370+
case CallingConv::ARM64EC_Thunk_Native:
7371+
case CallingConv::ARM64EC_Thunk_X64:
7372+
// Must be kept in sync with AArch64CallingConv.td
7373+
NestReg = 0x04; // X4
7374+
break;
7375+
}
73807376

7381-
Args.push_back(Entry);
7382-
Entry.Node = FPtr;
7383-
Args.push_back(Entry);
7384-
Entry.Node = Nest;
7385-
Args.push_back(Entry);
7377+
const char FptrReg = 0x11; // X17
73867378

7387-
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
7388-
TargetLowering::CallLoweringInfo CLI(DAG);
7389-
CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
7390-
CallingConv::C, Type::getVoidTy(*DAG.getContext()),
7391-
DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
7379+
SDValue Addr = Trmp;
73927380

7393-
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
7394-
return CallResult.second;
7381+
SDLoc dl(Op);
7382+
OutChains[0] = DAG.getStore(
7383+
Chain, dl, DAG.getConstant(0x58000080u | NestReg, dl, MVT::i32), Addr,
7384+
MachinePointerInfo(TrmpAddr));
7385+
7386+
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7387+
DAG.getConstant(4, dl, MVT::i64));
7388+
OutChains[1] = DAG.getStore(
7389+
Chain, dl, DAG.getConstant(0x580000b0u | FptrReg, dl, MVT::i32), Addr,
7390+
MachinePointerInfo(TrmpAddr, 4));
7391+
7392+
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7393+
DAG.getConstant(8, dl, MVT::i64));
7394+
OutChains[2] =
7395+
DAG.getStore(Chain, dl, DAG.getConstant(0xd61f0220u, dl, MVT::i32), Addr,
7396+
MachinePointerInfo(TrmpAddr, 8));
7397+
7398+
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7399+
DAG.getConstant(16, dl, MVT::i64));
7400+
OutChains[3] =
7401+
DAG.getStore(Chain, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7402+
7403+
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7404+
DAG.getConstant(24, dl, MVT::i64));
7405+
OutChains[4] =
7406+
DAG.getStore(Chain, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7407+
7408+
SDValue StoreToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
7409+
7410+
SDValue EndOfTrmp = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7411+
DAG.getConstant(12, dl, MVT::i64));
7412+
7413+
// Call clear cache on the trampoline instructions.
7414+
return DAG.getNode(ISD::CLEAR_CACHE, dl, MVT::Other, StoreToken, Trmp,
7415+
EndOfTrmp);
73957416
}
73967417

73977418
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,

llvm/lib/TargetParser/Triple.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1732,8 +1732,6 @@ unsigned Triple::getTrampolineSize() const {
17321732
if (isOSLinux())
17331733
return 48;
17341734
break;
1735-
case Triple::aarch64:
1736-
return 36;
17371735
}
17381736
return 32;
17391737
}

llvm/test/CodeGen/AArch64/nest-register.ll

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,26 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc -disable-post-ra -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
23

34
; Tests that the 'nest' parameter attribute causes the relevant parameter to be
45
; passed in the right register.
56

67
define ptr @nest_receiver(ptr nest %arg) nounwind {
78
; CHECK-LABEL: nest_receiver:
8-
; CHECK-NEXT: // %bb.0:
9-
; CHECK-NEXT: mov x0, x18
10-
; CHECK-NEXT: ret
9+
; CHECK: // %bb.0:
10+
; CHECK-NEXT: mov x0, x15
11+
; CHECK-NEXT: ret
1112

1213
ret ptr %arg
1314
}
1415

1516
define ptr @nest_caller(ptr %arg) nounwind {
1617
; CHECK-LABEL: nest_caller:
17-
; CHECK: mov x18, x0
18-
; CHECK-NEXT: bl nest_receiver
19-
; CHECK: ret
18+
; CHECK: // %bb.0:
19+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
20+
; CHECK-NEXT: mov x15, x0
21+
; CHECK-NEXT: bl nest_receiver
22+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
23+
; CHECK-NEXT: ret
2024

2125
%result = call ptr @nest_receiver(ptr nest %arg)
2226
ret ptr %result

llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ define void @test_attributes(ptr byval(%struct2) %s) gc "statepoint-example" {
207207
; CHECK-NEXT: .cfi_offset w30, -16
208208
; CHECK-NEXT: ldr x8, [sp, #64]
209209
; CHECK-NEXT: ldr q0, [sp, #48]
210-
; CHECK-NEXT: mov x18, xzr
210+
; CHECK-NEXT: mov x15, xzr
211211
; CHECK-NEXT: mov w0, #42 // =0x2a
212212
; CHECK-NEXT: mov w1, #17 // =0x11
213213
; CHECK-NEXT: str x8, [sp, #16]

0 commit comments

Comments
 (0)