Skip to content

Commit db054a1

Browse files
[AArch64][SME] Fix ADDVL addressing to scavenged stackslot. (#109674)
In https://reviews.llvm.org/D159196 we avoided stack-slot scavenging when there was no frame pointer (FP) available. However, when the FP is available, we need to actually prefer using the FP over the base pointer (BP). This change affects more than just SME, but it should be a general improvement: any slot above the address pointed to by the FP is always closer to the FP than to the BP, so it makes sense to always favour using the FP to address it when the FP is available. This also fixes the issue for SME, where using the FP is not just preferred but required.
1 parent 8ba334b commit db054a1

File tree

2 files changed

+50
-6
lines changed

2 files changed

+50
-6
lines changed

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2757,7 +2757,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
27572757
bool FPOffsetFits = !ForSimm || FPOffset >= -256;
27582758
PreferFP |= Offset > -FPOffset && !SVEStackSize;
27592759

2760-
if (MFI.hasVarSizedObjects()) {
2760+
if (FPOffset >= 0) {
2761+
// If the FPOffset is positive, that'll always be best, as the SP/BP
2762+
// will be even further away.
2763+
UseFP = true;
2764+
} else if (MFI.hasVarSizedObjects()) {
27612765
// If we have variable sized objects, we can use either FP or BP, as the
27622766
// SP offset is unknown. We can use the base pointer if we have one and
27632767
// FP is not preferred. If not, we're stuck with using FP.
@@ -2769,11 +2773,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
27692773
// else we can use BP and FP, but the offset from FP won't fit.
27702774
// That will make us scavenge registers which we can probably avoid by
27712775
// using BP. If it won't fit for BP either, we'll scavenge anyway.
2772-
} else if (FPOffset >= 0) {
2773-
// Use SP or FP, whichever gives us the best chance of the offset
2774-
// being in range for direct access. If the FPOffset is positive,
2775-
// that'll always be best, as the SP will be even further away.
2776-
UseFP = true;
27772776
} else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
27782777
// Funclets access the locals contained in the parent's stack frame
27792778
// via the frame pointer, so we have to use the FP in the parent

llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,51 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
4545
ret void
4646
}
4747

48+
define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-pointer"="all" {
49+
; CHECK-LABEL: test_no_stackslot_scavenging_with_fp:
50+
; CHECK: // %bb.0:
51+
; CHECK-NEXT: stp d15, d14, [sp, #-128]! // 16-byte Folded Spill
52+
; CHECK-NEXT: cntd x9
53+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
54+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
55+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
56+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
57+
; CHECK-NEXT: add x29, sp, #64
58+
; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
59+
; CHECK-NEXT: stp x28, x25, [sp, #96] // 16-byte Folded Spill
60+
; CHECK-NEXT: stp x24, x19, [sp, #112] // 16-byte Folded Spill
61+
; CHECK-NEXT: addvl sp, sp, #-1
62+
; CHECK-NEXT: lsl x9, x0, #3
63+
; CHECK-NEXT: mov x8, sp
64+
; CHECK-NEXT: mov x19, sp
65+
; CHECK-NEXT: str s0, [x29, #28] // 4-byte Folded Spill
66+
; CHECK-NEXT: add x9, x9, #15
67+
; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
68+
; CHECK-NEXT: sub x8, x8, x9
69+
; CHECK-NEXT: mov sp, x8
70+
; CHECK-NEXT: //APP
71+
; CHECK-NEXT: //NO_APP
72+
; CHECK-NEXT: smstop sm
73+
; CHECK-NEXT: ldr s0, [x29, #28] // 4-byte Folded Reload
74+
; CHECK-NEXT: bl use_f
75+
; CHECK-NEXT: smstart sm
76+
; CHECK-NEXT: sub sp, x29, #64
77+
; CHECK-NEXT: ldp x24, x19, [sp, #112] // 16-byte Folded Reload
78+
; CHECK-NEXT: ldp x28, x25, [sp, #96] // 16-byte Folded Reload
79+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
80+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
81+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
82+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
83+
; CHECK-NEXT: ldp d15, d14, [sp], #128 // 16-byte Folded Reload
84+
; CHECK-NEXT: ret
85+
%ptr2 = alloca i64, i64 %n, align 8
86+
%ptr = alloca <vscale x 16 x i8>
87+
call void asm sideeffect "", "~{x24},~{x25}"() nounwind
88+
call void @use_f(float %f)
89+
ret void
90+
}
91+
4892
declare void @use_f(float)
93+
declare void @use_f_and_ptr(float, ptr)
4994

5095
attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" }

0 commit comments

Comments
 (0)