Skip to content

Commit d297399

Browse files
committed
[AArch64][SME] Enable TPIDR2 lazy-save for za_preserved
This change makes callees with the __arm_preserves_za type attribute comply with the dormant state requirements when their caller has the __arm_shared_za type attribute. Several external SME functions also do not need a lazy save. https://github.com/ARM-software/abi-aa/blob/5e67092434b50c04f8ad178a9c272ce3c6ada7fd/aapcs64/aapcs64.rst?plain=1#L1381 Differential Revision: https://reviews.llvm.org/D159186
1 parent 257eb74 commit d297399

File tree

4 files changed

+111
-56
lines changed

4 files changed

+111
-56
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 45 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -4823,17 +4823,6 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
48234823
Mask);
48244824
}
48254825

4826-
static std::optional<SMEAttrs> getCalleeAttrsFromExternalFunction(SDValue V) {
// Diff note: every line of this helper carries a '-' marker, i.e. this commit
// deletes it. In the same diff, LowerCall switches to constructing
// SMEAttrs directly from ES->getSymbol().
//
// Maps a callee operand to the SME attributes of a known SME ABI support
// routine, recognised purely by symbol name. Returns std::nullopt when V is
// not an ExternalSymbolSDNode or when its symbol is none of the names below.
4827-
if (auto *ES = dyn_cast<ExternalSymbolSDNode>(V)) {
4828-
StringRef S(ES->getSymbol());
4829-
if (S == "__arm_sme_state" || S == "__arm_tpidr2_save")
4830-
return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved);
4831-
if (S == "__arm_tpidr2_restore")
4832-
return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared);
4833-
}
4834-
// Not an external symbol, or an external symbol with no special-cased name.
return std::nullopt;
4835-
}
4836-
48374826
SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
48384827
SelectionDAG &DAG) const {
48394828
unsigned IntNo = Op.getConstantOperandVal(1);
@@ -7375,28 +7364,31 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
73757364
SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
73767365
if (CLI.CB)
73777366
CalleeAttrs = SMEAttrs(*CLI.CB);
7378-
else if (std::optional<SMEAttrs> Attrs =
7379-
getCalleeAttrsFromExternalFunction(CLI.Callee))
7380-
CalleeAttrs = *Attrs;
7367+
else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7368+
CalleeAttrs = SMEAttrs(ES->getSymbol());
73817369

73827370
bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
7383-
7384-
MachineFrameInfo &MFI = MF.getFrameInfo();
73857371
if (RequiresLazySave) {
7386-
// Set up a lazy save mechanism by storing the runtime live slices
7387-
// (worst-case N*N) to the TPIDR2 stack object.
7388-
SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7389-
DAG.getConstant(1, DL, MVT::i32));
7390-
SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
7391-
unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
7372+
SDValue NumZaSaveSlices;
7373+
if (!CalleeAttrs.preservesZA()) {
7374+
// Set up a lazy save mechanism by storing the runtime live slices
7375+
// (worst-case SVL*SVL) to the TPIDR2 stack object.
7376+
SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7377+
DAG.getConstant(1, DL, MVT::i32));
7378+
NumZaSaveSlices = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
7379+
} else if (CalleeAttrs.preservesZA()) {
7380+
NumZaSaveSlices = DAG.getConstant(0, DL, MVT::i64);
7381+
}
73927382

7383+
unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
73937384
MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
73947385
SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
73957386
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7396-
SDValue BufferPtrAddr =
7387+
SDValue NumZaSaveSlicesAddr =
73977388
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
73987389
DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
7399-
Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16);
7390+
Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
7391+
MPI, MVT::i16);
74007392
Chain = DAG.getNode(
74017393
ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
74027394
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
@@ -7503,6 +7495,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
75037495

75047496
Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
75057497
Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
7498+
MachineFrameInfo &MFI = MF.getFrameInfo();
75067499
int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
75077500
if (isScalable)
75087501
MFI.setStackID(FI, TargetStackID::ScalableVector);
@@ -7819,35 +7812,34 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
78197812
}
78207813

78217814
if (RequiresLazySave) {
7822-
// Unconditionally resume ZA.
7823-
Result = DAG.getNode(
7824-
AArch64ISD::SMSTART, DL, MVT::Other, Result,
7825-
DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
7826-
DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
7827-
7828-
// Conditionally restore the lazy save using a pseudo node.
7829-
unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
7830-
SDValue RegMask = DAG.getRegisterMask(
7831-
TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
7832-
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
7833-
"__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
7834-
SDValue TPIDR2_EL0 = DAG.getNode(
7835-
ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
7836-
DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
7837-
7838-
// Copy the address of the TPIDR2 block into X0 before 'calling' the
7839-
// RESTORE_ZA pseudo.
7840-
SDValue Glue;
7841-
SDValue TPIDR2Block = DAG.getFrameIndex(
7842-
FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7843-
Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
7844-
Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
7845-
{Result, TPIDR2_EL0,
7846-
DAG.getRegister(AArch64::X0, MVT::i64),
7847-
RestoreRoutine,
7848-
RegMask,
7849-
Result.getValue(1)});
7850-
7815+
if (!CalleeAttrs.preservesZA()) {
7816+
// Unconditionally resume ZA.
7817+
Result = DAG.getNode(
7818+
AArch64ISD::SMSTART, DL, MVT::Other, Result,
7819+
DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
7820+
DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
7821+
7822+
// Conditionally restore the lazy save using a pseudo node.
7823+
unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
7824+
SDValue RegMask = DAG.getRegisterMask(
7825+
TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
7826+
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
7827+
"__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
7828+
SDValue TPIDR2_EL0 = DAG.getNode(
7829+
ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
7830+
DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
7831+
7832+
// Copy the address of the TPIDR2 block into X0 before 'calling' the
7833+
// RESTORE_ZA pseudo.
7834+
SDValue Glue;
7835+
SDValue TPIDR2Block = DAG.getFrameIndex(
7836+
FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7837+
Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
7838+
Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
7839+
{Result, TPIDR2_EL0,
7840+
DAG.getRegister(AArch64::X0, MVT::i64),
7841+
RestoreRoutine, RegMask, Result.getValue(1)});
7842+
}
78517843
// Finally reset the TPIDR2_EL0 register to 0.
78527844
Result = DAG.getNode(
78537845
ISD::INTRINSIC_VOID, DL, MVT::Other, Result,

llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,26 @@ void SMEAttrs::set(unsigned M, bool Enable) {
2424
"ZA_New and ZA_Shared are mutually exclusive");
2525
assert(!(hasNewZABody() && preservesZA()) &&
2626
"ZA_New and ZA_Preserved are mutually exclusive");
27+
assert(!(hasNewZABody() && (Bitmask & ZA_NoLazySave)) &&
28+
"ZA_New and ZA_NoLazySave are mutually exclusive");
29+
assert(!(hasSharedZAInterface() && (Bitmask & ZA_NoLazySave)) &&
30+
"ZA_Shared and ZA_NoLazySave are mutually exclusive");
2731
}
2832

2933
SMEAttrs::SMEAttrs(const CallBase &CB) {
// Builds the attributes for a call site. It starts from the attribute list
// attached to the call itself and then, only when getCalledFunction() returns
// a non-null function, passes additional bits to set().
//
// Diff note: the '-' lines are the pre-commit form, which forwarded only the
// bits derived from the called function (SMEAttrs(*F)). The '+' lines are the
// post-commit form, which additionally ORs in the bits derived from the
// function's name (SMEAttrs(F->getName())).
3034
*this = SMEAttrs(CB.getAttributes());
31-
if (auto *F = CB.getCalledFunction())
32-
set(SMEAttrs(*F).Bitmask);
35+
if (auto *F = CB.getCalledFunction()) {
36+
set(SMEAttrs(*F).Bitmask | SMEAttrs(F->getName()).Bitmask);
37+
}
38+
}
39+
40+
SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) {
// Added by this commit: derives attributes purely from a function name.
// Only the three SME ABI support routine names compared below contribute any
// bits; for every other name Bitmask stays 0.
//
// The bits are OR-ed into Bitmask directly rather than through set(), so the
// asserts inside set() are not evaluated by this constructor.
41+
if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state")
42+
Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved |
43+
SMEAttrs::ZA_NoLazySave);
44+
// NOTE(review): this case combines ZA_Shared with ZA_NoLazySave, while the
// assert added to SMEAttrs::set() in this same commit describes those two as
// mutually exclusive. The CallBase constructor forwards name-derived bits to
// set(), so confirm a direct call to __arm_tpidr2_restore cannot trip it.
if (FuncName == "__arm_tpidr2_restore")
45+
Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared |
46+
SMEAttrs::ZA_NoLazySave);
3347
}
3448

3549
SMEAttrs::SMEAttrs(const AttributeList &Attrs) {

llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,15 @@ class SMEAttrs {
3535
ZA_Shared = 1 << 3, // aarch64_pstate_za_shared
3636
ZA_New = 1 << 4, // aarch64_pstate_za_new
3737
ZA_Preserved = 1 << 5, // aarch64_pstate_za_preserved
38+
ZA_NoLazySave = 1 << 6, // Used for SME ABI routines to avoid lazy saves
3839
All = ZA_Preserved - 1
3940
};
4041

4142
SMEAttrs(unsigned Mask = Normal) : Bitmask(0) { set(Mask); }
4243
SMEAttrs(const Function &F) : SMEAttrs(F.getAttributes()) {}
4344
SMEAttrs(const CallBase &CB);
4445
SMEAttrs(const AttributeList &L);
46+
SMEAttrs(StringRef FuncName);
4547

4648
void set(unsigned M, bool Enable = true);
4749

@@ -82,7 +84,7 @@ class SMEAttrs {
8284
}
8385
bool requiresLazySave(const SMEAttrs &Callee) const {
// Decides whether a call from a function with these attributes (the caller)
// to Callee requires a lazy save. All of the following must hold: the caller
// reports hasZAState(), the callee reports hasPrivateZAInterface(), and the
// final operand of the expression below is true.
//
// Diff note: the final operand changes in this commit. The '-' line excluded
// callees for which preservesZA() is true; the '+' line instead excludes only
// callees whose Bitmask has ZA_NoLazySave set.
8486
return hasZAState() && Callee.hasPrivateZAInterface() &&
85-
!Callee.preservesZA();
87+
!(Callee.Bitmask & ZA_NoLazySave);
8688
}
8789
};
8890

llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s
33

44
declare void @private_za_callee()
5+
declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved"
56
declare float @llvm.cos.f32(float)
67

78
; Test lazy-save mechanism for a single callee.
@@ -165,3 +166,49 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z
165166
call void @private_za_callee()
166167
ret void
167168
}
169+
170+
171+
; Test lazy-save mechanism for an aarch64_pstate_za_shared caller
172+
; calling a callee with aarch64_pstate_za_preserved.
173+
define void @za_shared_caller_za_preserved_callee() nounwind "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" {
174+
; CHECK-LABEL: za_shared_caller_za_preserved_callee:
175+
; CHECK: // %bb.0:
176+
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
177+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
178+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
179+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
180+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
181+
; CHECK-NEXT: add x29, sp, #64
182+
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
183+
; CHECK-NEXT: sub sp, sp, #16
184+
; CHECK-NEXT: rdsvl x8, #1
185+
; CHECK-NEXT: mov x9, sp
186+
; CHECK-NEXT: msub x8, x8, x8, x9
187+
; CHECK-NEXT: mov sp, x8
188+
; CHECK-NEXT: stur x8, [x29, #-80]
189+
; CHECK-NEXT: sub x8, x29, #80
190+
; CHECK-NEXT: sturh wzr, [x29, #-72]
191+
; CHECK-NEXT: msr TPIDR2_EL0, x8
192+
; CHECK-NEXT: bl __arm_sme_state
193+
; CHECK-NEXT: and x19, x0, #0x1
194+
; CHECK-NEXT: tbz x19, #0, .LBB4_2
195+
; CHECK-NEXT: // %bb.1:
196+
; CHECK-NEXT: smstop sm
197+
; CHECK-NEXT: .LBB4_2:
198+
; CHECK-NEXT: bl private_za_preserved_callee
199+
; CHECK-NEXT: tbz x19, #0, .LBB4_4
200+
; CHECK-NEXT: // %bb.3:
201+
; CHECK-NEXT: smstart sm
202+
; CHECK-NEXT: .LBB4_4:
203+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
204+
; CHECK-NEXT: sub sp, x29, #64
205+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
206+
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
207+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
208+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
209+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
210+
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
211+
; CHECK-NEXT: ret
212+
call void @private_za_preserved_callee()
213+
ret void
214+
}

0 commit comments

Comments
 (0)