-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AArch64][SME] Fix ADDVL addressing to scavenged stackslot. #109674
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][SME] Fix ADDVL addressing to scavenged stackslot. #109674
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Sander de Smalen (sdesmalen-arm) ChangesIn https://reviews.llvm.org/D159196 we avoided stackslot scavenging This change affects more than just SME, but it should be a general This also fixes the issue for SME where this is not just preferred Full diff: https://github.com/llvm/llvm-project/pull/109674.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 7e041b086599b9..30c3a886c40453 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2757,7 +2757,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
bool FPOffsetFits = !ForSimm || FPOffset >= -256;
PreferFP |= Offset > -FPOffset && !SVEStackSize;
- if (MFI.hasVarSizedObjects()) {
+ if (FPOffset >= 0) {
+ // If the FPOffset is positive, that'll always be best, as the SP/BP
+ // will be even further away.
+ UseFP = true;
+ } if (MFI.hasVarSizedObjects()) {
// If we have variable sized objects, we can use either FP or BP, as the
// SP offset is unknown. We can use the base pointer if we have one and
// FP is not preferred. If not, we're stuck with using FP.
@@ -2769,11 +2773,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
// else we can use BP and FP, but the offset from FP won't fit.
// That will make us scavenge registers which we can probably avoid by
// using BP. If it won't fit for BP either, we'll scavenge anyway.
- } else if (FPOffset >= 0) {
- // Use SP or FP, whichever gives us the best chance of the offset
- // being in range for direct access. If the FPOffset is positive,
- // that'll always be best, as the SP will be even further away.
- UseFP = true;
} else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
// Funclets access the locals contained in the parent's stack frame
// via the frame pointer, so we have to use the FP in the parent
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index ac19bd59babe46..803bb9fda458b9 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -45,6 +45,51 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
ret void
}
+define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-pointer"="all" {
+; CHECK-LABEL: test_no_stackslot_scavenging_with_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-128]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x28, x25, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: stp x24, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: lsl x9, x0, #3
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: str s0, [x29, #28] // 4-byte Folded Spill
+; CHECK-NEXT: add x9, x9, #15
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr s0, [x29, #28] // 4-byte Folded Reload
+; CHECK-NEXT: bl use_f
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x24, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x28, x25, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #128 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ptr2 = alloca i64, i64 %n, align 8
+ %ptr = alloca <vscale x 16 x i8>
+ call void asm sideeffect "", "~{x24},~{x25}"() nounwind
+ call void @use_f(float %f)
+ ret void
+}
+
declare void @use_f(float)
+declare void @use_f_and_ptr(float, ptr)
attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" }
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
In https://reviews.llvm.org/D159196 we avoided stackslot scavenging when there was no FP available. But in the case where FP is available we need to actually prefer using the FP over the BP. This change affects more than just SME, but in general it should be an improvement, since any slot above the (address pointed to by) FP is always closer to FP than BP, so it makes sense to favour using the FP to address it. This also fixes the issue for SME, where this is not just preferred but required.
0e279fd
to
5571e53
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
In https://reviews.llvm.org/D159196 we avoided stackslot scavenging
when there was no FP available. But in the case where FP is available
we need to actually prefer using the FP over the BP.
This change affects more than just SME, but it should be a general
improvement, since any slot above the (address pointed to by) FP
is always closer to FP than BP, so it makes sense to always favour
using the FP to address it when the FP is available.
This also fixes the issue for SME where this is not just preferred
but required.