Skip to content

Commit 8fbe69a

Browse files
hazzlimtru
authored and committed
[AArch64] Add streaming-mode stack hazard optimization remarks (#101695)
Emit an optimization remark when objects in the stack frame may cause hazards in a streaming mode function. The analysis requires either the `aarch64-stack-hazard-size` or `aarch64-stack-hazard-remark-size` flag to be set by the user, with the former flag taking precedence. (cherry picked from commit a98a0dc)
1 parent b45f752 commit 8fbe69a

File tree

6 files changed

+364
-11
lines changed

6 files changed

+364
-11
lines changed

llvm/include/llvm/CodeGen/TargetFrameLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "llvm/ADT/BitVector.h"
1717
#include "llvm/CodeGen/MachineBasicBlock.h"
18+
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
1819
#include "llvm/Support/TypeSize.h"
1920
#include <vector>
2021

@@ -473,6 +474,11 @@ class TargetFrameLowering {
473474
/// Return the frame base information to be encoded in the DWARF subprogram
474475
/// debug info.
475476
virtual DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const;
477+
478+
/// This method is called at the end of prolog/epilog code insertion, so
479+
/// targets can emit remarks based on the final frame layout.
480+
virtual void emitRemarks(const MachineFunction &MF,
481+
MachineOptimizationRemarkEmitter *ORE) const {};
476482
};
477483

478484
} // End llvm namespace

llvm/lib/CodeGen/PrologEpilogInserter.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,9 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
341341
<< ore::NV("Function", MF.getFunction().getName()) << "'";
342342
});
343343

344+
// Emit any remarks implemented for the target, based on final frame layout.
345+
TFI->emitRemarks(MF, ORE);
346+
344347
delete RS;
345348
SaveBlocks.clear();
346349
RestoreBlocks.clear();

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 196 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@
240240
#include "llvm/Support/CommandLine.h"
241241
#include "llvm/Support/Debug.h"
242242
#include "llvm/Support/ErrorHandling.h"
243+
#include "llvm/Support/FormatVariadic.h"
243244
#include "llvm/Support/MathExtras.h"
244245
#include "llvm/Support/raw_ostream.h"
245246
#include "llvm/Target/TargetMachine.h"
@@ -275,6 +276,10 @@ cl::opt<bool> EnableHomogeneousPrologEpilog(
275276
// Stack hazard padding size. 0 = disabled. When non-zero, this value is also
// used (in preference to aarch64-stack-hazard-remark-size) as the hazard size
// for the stack hazard analysis remarks.
276277
static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size",
277278
cl::init(0), cl::Hidden);
279+
// Stack hazard size for analysis remarks. StackHazardSize takes precedence:
// this value is only consulted when aarch64-stack-hazard-size is 0. If both
// are 0, no stack hazard remarks are emitted.
280+
static cl::opt<unsigned>
281+
StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0),
282+
cl::Hidden);
278283
// Whether to insert padding into non-streaming functions (for testing).
279284
static cl::opt<bool>
280285
StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
@@ -2615,9 +2620,16 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
26152620
const auto &MFI = MF.getFrameInfo();
26162621

26172622
int64_t ObjectOffset = MFI.getObjectOffset(FI);
2623+
StackOffset SVEStackSize = getSVEStackSize(MF);
2624+
2625+
// For VLA-area objects, just emit an offset at the end of the stack frame.
2626+
// Whilst not quite correct, these objects do live at the end of the frame and
2627+
// so it is more useful for analysis for the offset to reflect this.
2628+
if (MFI.isVariableSizedObjectIndex(FI)) {
2629+
return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize;
2630+
}
26182631

26192632
// This is correct in the absence of any SVE stack objects.
2620-
StackOffset SVEStackSize = getSVEStackSize(MF);
26212633
if (!SVEStackSize)
26222634
return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea());
26232635

@@ -3528,13 +3540,9 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
35283540
return true;
35293541
}
35303542

3531-
// Return the FrameID for a Load/Store instruction by looking at the MMO.
3532-
static std::optional<int> getLdStFrameID(const MachineInstr &MI,
3533-
const MachineFrameInfo &MFI) {
3534-
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
3535-
return std::nullopt;
3536-
3537-
MachineMemOperand *MMO = *MI.memoperands_begin();
3543+
// Return the FrameID for a MMO.
3544+
static std::optional<int> getMMOFrameID(MachineMemOperand *MMO,
3545+
const MachineFrameInfo &MFI) {
35383546
auto *PSV =
35393547
dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
35403548
if (PSV)
@@ -3552,6 +3560,15 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI,
35523560
return std::nullopt;
35533561
}
35543562

3563+
// Return the FrameID for a Load/Store instruction by looking at the first MMO.
3564+
static std::optional<int> getLdStFrameID(const MachineInstr &MI,
3565+
const MachineFrameInfo &MFI) {
3566+
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
3567+
return std::nullopt;
3568+
3569+
return getMMOFrameID(*MI.memoperands_begin(), MFI);
3570+
}
3571+
35553572
// Check if a Hazard slot is needed for the current function, and if so create
35563573
// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
35573574
// which can be used to determine if any hazard padding is needed.
@@ -5029,3 +5046,174 @@ void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
50295046
MI->eraseFromParent();
50305047
}
50315048
}
5049+
5050+
struct StackAccess {
5051+
enum AccessType {
5052+
NotAccessed = 0, // Stack object not accessed by load/store instructions.
5053+
GPR = 1 << 0, // A general purpose register.
5054+
PPR = 1 << 1, // A predicate register.
5055+
FPR = 1 << 2, // A floating point/Neon/SVE register.
5056+
};
5057+
5058+
int Idx;
5059+
StackOffset Offset;
5060+
int64_t Size;
5061+
unsigned AccessTypes;
5062+
5063+
StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {}
5064+
5065+
bool operator<(const StackAccess &Rhs) const {
5066+
return std::make_tuple(start(), Idx) <
5067+
std::make_tuple(Rhs.start(), Rhs.Idx);
5068+
}
5069+
5070+
bool isCPU() const {
5071+
// Predicate register load and store instructions execute on the CPU.
5072+
return AccessTypes & (AccessType::GPR | AccessType::PPR);
5073+
}
5074+
bool isSME() const { return AccessTypes & AccessType::FPR; }
5075+
bool isMixed() const { return isCPU() && isSME(); }
5076+
5077+
int64_t start() const { return Offset.getFixed() + Offset.getScalable(); }
5078+
int64_t end() const { return start() + Size; }
5079+
5080+
std::string getTypeString() const {
5081+
switch (AccessTypes) {
5082+
case AccessType::FPR:
5083+
return "FPR";
5084+
case AccessType::PPR:
5085+
return "PPR";
5086+
case AccessType::GPR:
5087+
return "GPR";
5088+
case AccessType::NotAccessed:
5089+
return "NA";
5090+
default:
5091+
return "Mixed";
5092+
}
5093+
}
5094+
5095+
void print(raw_ostream &OS) const {
5096+
OS << getTypeString() << " stack object at [SP"
5097+
<< (Offset.getFixed() < 0 ? "" : "+") << Offset.getFixed();
5098+
if (Offset.getScalable())
5099+
OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable()
5100+
<< " * vscale";
5101+
OS << "]";
5102+
}
5103+
};
5104+
5105+
// Stream insertion for StackAccess; forwards to StackAccess::print and hands
// the stream back so insertions can be chained.
static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) {
  SA.print(OS);
  return OS;
}
5109+
5110+
void AArch64FrameLowering::emitRemarks(
5111+
const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const {
5112+
5113+
SMEAttrs Attrs(MF.getFunction());
5114+
if (Attrs.hasNonStreamingInterfaceAndBody())
5115+
return;
5116+
5117+
const uint64_t HazardSize =
5118+
(StackHazardSize) ? StackHazardSize : StackHazardRemarkSize;
5119+
5120+
if (HazardSize == 0)
5121+
return;
5122+
5123+
const MachineFrameInfo &MFI = MF.getFrameInfo();
5124+
// Bail if function has no stack objects.
5125+
if (!MFI.hasStackObjects())
5126+
return;
5127+
5128+
std::vector<StackAccess> StackAccesses(MFI.getNumObjects());
5129+
5130+
size_t NumFPLdSt = 0;
5131+
size_t NumNonFPLdSt = 0;
5132+
5133+
// Collect stack accesses via Load/Store instructions.
5134+
for (const MachineBasicBlock &MBB : MF) {
5135+
for (const MachineInstr &MI : MBB) {
5136+
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
5137+
continue;
5138+
for (MachineMemOperand *MMO : MI.memoperands()) {
5139+
std::optional<int> FI = getMMOFrameID(MMO, MFI);
5140+
if (FI && !MFI.isDeadObjectIndex(*FI)) {
5141+
int FrameIdx = *FI;
5142+
5143+
size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects();
5144+
if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) {
5145+
StackAccesses[ArrIdx].Idx = FrameIdx;
5146+
StackAccesses[ArrIdx].Offset =
5147+
getFrameIndexReferenceFromSP(MF, FrameIdx);
5148+
StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx);
5149+
}
5150+
5151+
unsigned RegTy = StackAccess::AccessType::GPR;
5152+
if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
5153+
if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
5154+
RegTy = StackAccess::PPR;
5155+
else
5156+
RegTy = StackAccess::FPR;
5157+
} else if (AArch64InstrInfo::isFpOrNEON(MI)) {
5158+
RegTy = StackAccess::FPR;
5159+
}
5160+
5161+
StackAccesses[ArrIdx].AccessTypes |= RegTy;
5162+
5163+
if (RegTy == StackAccess::FPR)
5164+
++NumFPLdSt;
5165+
else
5166+
++NumNonFPLdSt;
5167+
}
5168+
}
5169+
}
5170+
}
5171+
5172+
if (NumFPLdSt == 0 || NumNonFPLdSt == 0)
5173+
return;
5174+
5175+
llvm::sort(StackAccesses);
5176+
StackAccesses.erase(llvm::remove_if(StackAccesses,
5177+
[](const StackAccess &S) {
5178+
return S.AccessTypes ==
5179+
StackAccess::NotAccessed;
5180+
}),
5181+
StackAccesses.end());
5182+
5183+
SmallVector<const StackAccess *> MixedObjects;
5184+
SmallVector<std::pair<const StackAccess *, const StackAccess *>> HazardPairs;
5185+
5186+
if (StackAccesses.front().isMixed())
5187+
MixedObjects.push_back(&StackAccesses.front());
5188+
5189+
for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end());
5190+
It != End; ++It) {
5191+
const auto &First = *It;
5192+
const auto &Second = *(It + 1);
5193+
5194+
if (Second.isMixed())
5195+
MixedObjects.push_back(&Second);
5196+
5197+
if ((First.isSME() && Second.isCPU()) ||
5198+
(First.isCPU() && Second.isSME())) {
5199+
uint64_t Distance = static_cast<uint64_t>(Second.start() - First.end());
5200+
if (Distance < HazardSize)
5201+
HazardPairs.emplace_back(&First, &Second);
5202+
}
5203+
}
5204+
5205+
auto EmitRemark = [&](llvm::StringRef Str) {
5206+
ORE->emit([&]() {
5207+
auto R = MachineOptimizationRemarkAnalysis(
5208+
"sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front());
5209+
return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str;
5210+
});
5211+
};
5212+
5213+
for (const auto &P : HazardPairs)
5214+
EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str());
5215+
5216+
for (const auto *Obj : MixedObjects)
5217+
EmitRemark(
5218+
formatv("{0} accessed by both GP and FP instructions", *Obj).str());
5219+
}

llvm/lib/Target/AArch64/AArch64FrameLowering.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
1414
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
1515

16-
#include "llvm/Support/TypeSize.h"
16+
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
1717
#include "llvm/CodeGen/TargetFrameLowering.h"
18+
#include "llvm/Support/TypeSize.h"
1819

1920
namespace llvm {
2021

@@ -178,6 +179,9 @@ class AArch64FrameLowering : public TargetFrameLowering {
178179
inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
179180
int64_t NegProbeSize,
180181
Register TargetReg) const;
182+
183+
/// Override of the TargetFrameLowering remark hook: emits the AArch64
/// streaming-mode stack hazard remarks for \p MF through \p ORE.
void emitRemarks(const MachineFunction &MF,
184+
MachineOptimizationRemarkEmitter *ORE) const override;
181185
};
182186

183187
} // End llvm namespace

0 commit comments

Comments
 (0)