[AArch64] fix trampoline implementation: use X15 #126743

Open · wants to merge 2 commits into base: main
5 changes: 0 additions & 5 deletions compiler-rt/lib/builtins/README.txt
@@ -272,11 +272,6 @@ switch32
switch8
switchu8

// This function generates a custom trampoline function with the specific
// realFunc and localsPtr values.
void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
const void* realFunc, void* localsPtr);

// There is no C interface to the *_vfp_d8_d15_regs functions. They are
// called in the prolog and epilog of Thumb1 functions. When the C++ ABI uses
// SJLJ for exceptions, each function with a catch clause or destructors needs
42 changes: 0 additions & 42 deletions compiler-rt/lib/builtins/trampoline_setup.c
@@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
__clear_cache(trampOnStack, &trampOnStack[10]);
}
#endif // __powerpc__ && !defined(__powerpc64__)

// The AArch64 compiler generates calls to __trampoline_setup() when creating
// trampoline functions on the stack for use with nested functions.
// This function creates a custom 36-byte trampoline function on the stack
// which loads x18 with a pointer to the outer function's locals
// and then jumps to the target nested function.
// Note: x18 is a reserved platform register on Windows and macOS.

#if defined(__aarch64__) && defined(__ELF__)
COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
int trampSizeAllocated,
const void *realFunc, void *localsPtr) {
// This should never happen, but if the compiler did not allocate
// enough space on the stack for the trampoline, abort.
if (trampSizeAllocated < 36)
compilerrt_abort();

// create trampoline
// Load realFunc into x17. mov/movk 16 bits at a time.
trampOnStack[0] =
0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
trampOnStack[1] =
0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
trampOnStack[2] =
0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
trampOnStack[3] =
0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
// Load localsPtr into x18
trampOnStack[4] =
0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
trampOnStack[5] =
0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
trampOnStack[6] =
0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
trampOnStack[7] =
0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
trampOnStack[8] = 0xd61f0220; // br x17

// Clear instruction cache.
__clear_cache(trampOnStack, &trampOnStack[9]);
}
#endif // defined(__aarch64__) && defined(__ELF__)
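For reference, the magic constants in the routine deleted above are plain A64 MOVZ/MOVK/BR encodings. A minimal sketch of the encoding scheme (illustration only, not part of the patch; the helper names movz, movk, and br are invented):

    #include <stdint.h>

    /* movz xN, #imm16 (hw field = 0) */
    static uint32_t movz(unsigned reg, uint16_t imm) {
      return 0xd2800000u | ((uint32_t)imm << 5) | reg;
    }

    /* movk xN, #imm16, lsl #shift, with shift in {16, 32, 48} (hw = shift/16) */
    static uint32_t movk(unsigned reg, uint16_t imm, unsigned shift) {
      return 0xf2800000u | ((shift / 16u) << 21) | ((uint32_t)imm << 5) | reg;
    }

    /* br xN */
    static uint32_t br(unsigned reg) { return 0xd61f0000u | (reg << 5); }

    /* movz(17, lo16) reproduces trampOnStack[0] (0xd2800000u | ... | 0x11),
       and br(17) == 0xd61f0220, the word stored at trampOnStack[8]. */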
2 changes: 1 addition & 1 deletion compiler-rt/test/builtins/Unit/trampoline_setup_test.c
@@ -7,7 +7,7 @@

/*
* Tests nested functions
* The ppc and aarch64 compilers generates a call to __trampoline_setup
* The ppc compiler generates a call to __trampoline_setup
* The i386 and x86_64 compilers generate a call to ___enable_execute_stack
*/

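For context, a sketch of the kind of code this test exercises (illustrative GNU C, not taken from the test file; the names outer and add are invented):

    /* GNU C extension: a nested function that captures a local variable.
       Taking its address forces the compiler to build a stack trampoline. */
    int outer(int x) {
      int add(int y) { return x + y; } /* reads outer's x via the static chain */
      int (*fp)(int) = add;            /* address escapes: trampoline required */
      return fp(1);
    }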
8 changes: 4 additions & 4 deletions flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -274,12 +274,12 @@ class BoxedProcedurePass
auto loc = embox.getLoc();
mlir::Type i8Ty = builder.getI8Type();
mlir::Type i8Ptr = builder.getRefType(i8Ty);
// For AArch64, PPC32 and PPC64, the thunk is populated by a call to
// For PPC32 and PPC64, the thunk is populated by a call to
// __trampoline_setup, which is defined in
// compiler-rt/lib/builtins/trampoline_setup.c and requires the
// thunk size greater than 32 bytes. For RISCV and x86_64, the
// thunk setup doesn't go through __trampoline_setup and fits in 32
// bytes.
// thunk size greater than 32 bytes. For AArch64, RISCV and x86_64,
// the thunk setup doesn't go through __trampoline_setup and fits in
// 32 bytes.
fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
auto buffer = builder.create<AllocaOp>(loc, buffTy);
4 changes: 2 additions & 2 deletions flang/test/Fir/boxproc.fir
@@ -3,7 +3,7 @@
// RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}

// CHECK-LABEL: define void @_QPtest_proc_dummy()
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-X86: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-PPC: %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
// CHECK: %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
@@ -63,7 +63,7 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
}

// CHECK-LABEL: define void @_QPtest_proc_dummy_char()
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-X86: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
// CHECK-PPC: %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
// CHECK: %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8
25 changes: 18 additions & 7 deletions llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -28,6 +28,12 @@ class CCIfSubtarget<string F, CCAction A>
//===----------------------------------------------------------------------===//

defvar AArch64_Common = [
// The 'nest' parameter, if any, is passed in X15.
// The register previously used here (X18) is the reserved platform register
// on several targets and hence unavailable for this purpose, while X9-X15
// are all free for LLVM to use; X15 is chosen since LLVM often clobbers it
// anyway (see the C sketch after this list).
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
@@ -117,13 +123,7 @@ defvar AArch64_Common = [
];
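As a rough illustration of the 'nest' mechanism above (a sketch, not part of the patch; callee and caller are invented names): Clang lets C code supply a static chain explicitly through its __builtin_call_with_static_chain extension, and with this change that pointer travels in X15 on AArch64 rather than X18.

    extern int callee(int v);

    int caller(void *chain) {
      /* chain becomes the callee's 'nest' parameter. */
      return __builtin_call_with_static_chain(callee(7), chain);
    }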

let Entry = 1 in
def CC_AArch64_AAPCS : CallingConv<!listconcat(
// The 'nest' parameter, if any, is passed in X18.
// Darwin and Windows use X18 as the platform register and hence 'nest' isn't
// currently supported there.
[CCIfNest<CCAssignToReg<[X18]>>],
AArch64_Common
)>;
def CC_AArch64_AAPCS : CallingConv<AArch64_Common>;

let Entry = 1 in
def RetCC_AArch64_AAPCS : CallingConv<[
@@ -177,6 +177,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
// a stack layout compatible with the x64 calling convention.
let Entry = 1 in
def CC_AArch64_Arm64EC_VarArg : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

// Convert small floating-point values to integer.
CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -353,6 +355,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[
// + Stack slots are sized as needed rather than being at least 64-bit.
let Entry = 1 in
def CC_AArch64_DarwinPCS : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -427,6 +431,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[

let Entry = 1 in
def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -450,6 +456,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
// same as the normal Darwin VarArgs handling.
let Entry = 1 in
def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,

@@ -494,6 +502,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[

let Entry = 1 in
def CC_AArch64_GHC : CallingConv<[
CCIfNest<CCAssignToReg<[X15]>>,

CCIfType<[iPTR], CCBitConvertToType<i64>>,

// Handle all vector types as either f64 or v2f64.
@@ -522,6 +532,7 @@ def CC_AArch64_Preserve_None : CallingConv<[

// We can pass arguments in all general registers, except:
// - X8, used for sret
// - X15 (on Windows), used as a temporary register in the prologue when allocating call frames
// - X16/X17, used by the linker as IP0/IP1
// - X18, the platform register
// - X19, the base pointer
79 changes: 58 additions & 21 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -327,7 +327,8 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF,
static bool produceCompactUnwindFrame(MachineFunction &MF);
static bool needsWinCFI(const MachineFunction &MF);
static StackOffset getSVEStackSize(const MachineFunction &MF);
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB);
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
                                                 bool HasCall = false);
static bool requiresSaveVG(const MachineFunction &MF);

/// Returns true if a homogeneous prolog or epilog code can be emitted
/// for the size optimization. If possible, a frame helper call is injected.
@@ -1002,6 +1003,16 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
}
}

static bool windowsRequiresStackProbe(const MachineFunction &MF,
uint64_t StackSizeInBytes) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
// TODO: When implementing stack protectors, take that into account
// for the probe threshold.
return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
}

static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
const MachineBasicBlock &MBB) {
const MachineFunction *MF = MBB.getParent();
@@ -1023,7 +1034,7 @@ static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
                                                 bool HasCall) {
MachineFunction *MF = MBB->getParent();

// If MBB is an entry block, use X9 as the scratch register
@@ -1037,6 +1048,11 @@ static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
LivePhysRegs LiveRegs(TRI);
getLiveRegsForEntryMBB(LiveRegs, *MBB);
if (HasCall) {
LiveRegs.addReg(AArch64::X16);
LiveRegs.addReg(AArch64::X17);
LiveRegs.addReg(AArch64::X18);
}

// Prefer X9 since it was historically used for the prologue scratch reg.
const MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -1077,23 +1093,16 @@ bool AArch64FrameLowering::canUseAsPrologue(
MBB.isLiveIn(AArch64::NZCV))
return false;

// Don't need a scratch register if we're not going to re-align the stack or
// emit stack probes.
if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF))
return true;
// Otherwise, we can use any block as long as it has a scratch register
// available.
return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
}
if (RegInfo->hasStackRealignment(*MF) || TLI->hasInlineStackProbe(*MF))
if (findScratchNonCalleeSaveRegister(TmpMBB) == AArch64::NoRegister)
return false;

static bool windowsRequiresStackProbe(MachineFunction &MF,
uint64_t StackSizeInBytes) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
// TODO: When implementing stack protectors, take that into account
// for the probe threshold.
return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
// We may need a scratch register (to hold the return value) if the prologue
// has to make a special call.
if (requiresSaveVG(*MF) ||
    windowsRequiresStackProbe(*MF, std::numeric_limits<uint64_t>::max()))
  if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister)
    return false;

return true;
}

static bool needsWinCFI(const MachineFunction &MF) {
@@ -1356,8 +1365,8 @@ bool requiresGetVGCall(MachineFunction &MF) {
!MF.getSubtarget<AArch64Subtarget>().hasSVE();
}

static bool requiresSaveVG(MachineFunction &MF) {
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
static bool requiresSaveVG(const MachineFunction &MF) {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// For Darwin platforms we don't save VG for non-SVE functions, even if SME
// is enabled with streaming mode changes.
if (!AFI->hasStreamingModeChanges())
@@ -1982,6 +1991,27 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
: 0;

if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
// Find an available register to spill the value of X15 to, if X15 is being
// used already for nest.
unsigned X15Scratch = AArch64::NoRegister;
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
if (llvm::any_of(MBB.liveins(),
[&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
AArch64::X15, LiveIn.PhysReg);
})) {
X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true);
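// With HasCall set, X16-X18 are treated as live and will not be chosen, and
// X15 itself is live-in here, so the scratch cannot alias the registers the
// stack-probe call sequence uses.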
assert(X15Scratch != AArch64::NoRegister &&
       (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
#ifndef NDEBUG
LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
#endif
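// Copy X15 into the scratch register; ORR Xd, XZR, Xm is the canonical
// register-to-register move.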
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
.addReg(AArch64::XZR)
.addReg(AArch64::X15, RegState::Undef)
.addReg(AArch64::X15, RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
}

uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
if (NeedsWinCFI) {
HasWinCFI = true;
@@ -2104,6 +2134,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// we've set a frame pointer and already finished the SEH prologue.
assert(!NeedsWinCFI);
}
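// If X15 was spilled to a scratch register above, restore the nest value.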
if (X15Scratch != AArch64::NoRegister) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
.addReg(AArch64::XZR)
.addReg(X15Scratch, RegState::Undef)
.addReg(X15Scratch, RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
}
}

StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
@@ -3208,7 +3245,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
unsigned X0Scratch = AArch64::NoRegister;
if (Reg1 == AArch64::VG) {
// Find an available register to store value of VG to.
Reg1 = findScratchNonCalleeSaveRegister(&MBB);
Reg1 = findScratchNonCalleeSaveRegister(&MBB, true);
assert(Reg1 != AArch64::NoRegister);
SMEAttrs Attrs(MF.getFunction());
