Skip to content

[AArch64][MachinePipeliner] Add pipeliner support for AArch64 #79589

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9608,6 +9608,111 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
return ExitMBB->begin();
}

namespace {
class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
MachineInstr *PredBranch;
SmallVector<MachineOperand, 4> Cond;

public:
AArch64PipelinerLoopInfo(MachineInstr *PredBranch,
const SmallVectorImpl<MachineOperand> &Cond)
: PredBranch(PredBranch), Cond(Cond.begin(), Cond.end()) {}

bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
// Make the instructions for loop control be placed in stage 0.
// The predecessors of PredBranch are considered by the caller.
return MI == PredBranch;
}

std::optional<bool> createTripCountGreaterCondition(
int TC, MachineBasicBlock &MBB,
SmallVectorImpl<MachineOperand> &CondParam) override {
// A branch instruction will be inserted as "if (Cond) goto epilogue".
// Cond is normalized for such use.
// The predecessors of the branch are assumed to have already been inserted.
CondParam = Cond;
return {};
}

void setPreheader(MachineBasicBlock *NewPreheader) override {}

void adjustTripCount(int TripCountAdjust) override {}

void disposed() override {}
};
} // namespace

/// Check whether \p Opcode is one of AArch64's compare-and-branch
/// instructions (CBZ/CBNZ and TBZ/TBNZ, in both 32- and 64-bit forms).
static bool isCompareAndBranch(unsigned Opcode) {
  switch (Opcode) {
  default:
    return false;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    return true;
  }
}

// Analyze LoopBB and decide whether the MachinePipeliner can software-pipeline
// it. Returns a PipelinerLoopInfo describing the loop-control instruction and
// exit condition on success, or nullptr when the loop shape is unsupported.
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
// Bail out when the block's terminators cannot be understood.
if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
return nullptr;

// Infinite loops are not supported
if (TBB == LoopBB && FBB == LoopBB)
return nullptr;

// Must be conditional branch
if (FBB == nullptr)
return nullptr;

assert((TBB == LoopBB || FBB == LoopBB) &&
"The Loop must be a single-basic-block loop");

// Normalization for createTripCountGreaterCondition()
// After this, Cond is the condition under which the loop is *exited*, i.e.
// "if (Cond) goto epilogue", which is the form that hook hands back.
if (TBB == LoopBB)
reverseBranchCondition(Cond);

MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
const TargetRegisterInfo &TRI = getRegisterInfo();

// Find the immediate predecessor of the conditional branch
MachineInstr *PredBranch = nullptr;
if (CondBranch->getOpcode() == AArch64::Bcc) {
// Bcc reads NZCV: scan the block backwards for the last instruction that
// defines NZCV; that is the instruction feeding the branch.
for (MachineInstr &MI : reverse(*LoopBB)) {
if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
PredBranch = &MI;
break;
}
}
if (!PredBranch)
return nullptr;
} else if (isCompareAndBranch(CondBranch->getOpcode())) {
// CBZ/CBNZ/TBZ/TBNZ test a register directly; the feeding instruction is
// the defining instruction of that virtual register (pre-RA SSA form).
const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
Register Reg = CondBranch->getOperand(0).getReg();
if (!Reg.isVirtual())
return nullptr;
PredBranch = MRI.getVRegDef(Reg);

// MachinePipeliner does not expect that the immediate predecessor is a Phi
if (PredBranch->isPHI())
return nullptr;

// The feeding instruction must live inside the loop body itself.
if (PredBranch->getParent() != LoopBB)
return nullptr;
} else {
// Any other terminator kind is not supported.
return nullptr;
}

return std::make_unique<AArch64PipelinerLoopInfo>(PredBranch, Cond);
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;

std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;

bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Subtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -540,3 +540,7 @@ AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
// performance regression or incompatibility with execute-only mappings.
return AArch64PAuth::AuthCheckMethod::None;
}

// Only enable the MachinePipeliner when this subtarget provides an
// instruction-level scheduling model; the pass relies on per-instruction
// scheduling information.
bool AArch64Subtarget::enableMachinePipeliner() const {
  const bool HasInstrSchedModel = getSchedModel().hasInstrSchedModel();
  return HasInstrSchedModel;
}
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Subtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override { return usePostRAScheduler(); }

bool enableMachinePipeliner() const override;
bool useDFAforSMS() const override { return false; }

/// Returns ARM processor family.
/// Avoid this function! CPU specifics should be kept local to this class
/// and preferably modeled with SubtargetFeatures or properties in
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ static cl::opt<bool>
cl::desc("Enable sinking and folding of instruction copies"),
cl::init(true), cl::Hidden);

static cl::opt<bool>
EnableMachinePipeliner("aarch64-enable-pipeliner",
cl::desc("Enable Machine Pipeliner for AArch64"),
cl::init(false), cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
Expand Down Expand Up @@ -779,6 +784,8 @@ void AArch64PassConfig::addPreRegAlloc() {
// be register coalescer friendly.
addPass(&PeepholeOptimizerID);
}
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMachinePipeliner)
addPass(&MachinePipelinerID);
}

void AArch64PassConfig::addPostRegAlloc() {
Expand Down
78 changes: 78 additions & 0 deletions llvm/test/CodeGen/AArch64/sms-acceptable-loop1.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s

# An acceptable loop by pipeliner: TBB == ExitBB, FBB == LoopBB, Branch with NZCV flags
# CHECK: Schedule Found? 1
# The loop counter is decremented by SUBSXri (which defines NZCV) and the
# loop exits through "Bcc 0" (EQ) to bb.2, so analyzeLoopForPipelining's
# Bcc path must locate the NZCV-defining instruction and accept the loop.

--- |
define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%wide.trip.count = zext nneg i32 %n to i64
br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %for.body.preheader, %for.body
%lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
%lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ]
%lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
%0 = load float, ptr %lsr.iv9, align 4
%add = fadd float %0, 1.000000e+00
store float %add, ptr %lsr.iv, align 4
%scevgep = getelementptr i8, ptr %lsr.iv, i64 4
%scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4
%lsr.iv.next = add nsw i64 %lsr.iv11, -1
%exitcond.not = icmp eq i64 %lsr.iv.next, 0
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

...
---
name: func
tracksRegLiveness: true
liveins:
- { reg: '$x0', virtual-reg: '%7' }
- { reg: '$x1', virtual-reg: '%8' }
- { reg: '$w2', virtual-reg: '%9' }
body: |
bb.0.entry:
successors: %bb.1(0x50000000), %bb.2(0x30000000)
liveins: $x0, $x1, $w2

%9:gpr32common = COPY $w2
%8:gpr64 = COPY $x1
%7:gpr64 = COPY $x0
dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
Bcc 11, %bb.2, implicit $nzcv
B %bb.1

bb.1.for.body.preheader:
%11:gpr32 = ORRWrs $wzr, %9, 0
%0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
%14:fpr32 = FMOVSi 112
B %bb.3

bb.2.for.cond.cleanup:
RET_ReallyLR

bb.3.for.body:
successors: %bb.2(0x04000000), %bb.3(0x7c000000)

%1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
%2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
%3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9)
%15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr
early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv)
%4:gpr64all = COPY %16
%5:gpr64all = COPY %12
%17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
%6:gpr64all = COPY %17
Bcc 0, %bb.2, implicit $nzcv
B %bb.3

...
78 changes: 78 additions & 0 deletions llvm/test/CodeGen/AArch64/sms-acceptable-loop2.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s

# An acceptable loop by pipeliner: TBB == LoopBB, FBB == ExitBB, Branch with NZCV flags
# CHECK: Schedule Found? 1
# Same loop as sms-acceptable-loop1.mir, but the conditional branch targets
# the loop itself ("Bcc 1" / NE back to bb.3), so analyzeLoopForPipelining
# must normalize the condition with reverseBranchCondition before use.

--- |
define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%wide.trip.count = zext nneg i32 %n to i64
br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %for.body.preheader, %for.body
%lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
%lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ]
%lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
%0 = load float, ptr %lsr.iv9, align 4
%add = fadd float %0, 1.000000e+00
store float %add, ptr %lsr.iv, align 4
%scevgep = getelementptr i8, ptr %lsr.iv, i64 4
%scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4
%lsr.iv.next = add nsw i64 %lsr.iv11, -1
%exitcond.not = icmp eq i64 %lsr.iv.next, 0
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

...
---
name: func
tracksRegLiveness: true
liveins:
- { reg: '$x0', virtual-reg: '%7' }
- { reg: '$x1', virtual-reg: '%8' }
- { reg: '$w2', virtual-reg: '%9' }
body: |
bb.0.entry:
successors: %bb.1(0x50000000), %bb.2(0x30000000)
liveins: $x0, $x1, $w2

%9:gpr32common = COPY $w2
%8:gpr64 = COPY $x1
%7:gpr64 = COPY $x0
dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
Bcc 11, %bb.2, implicit $nzcv
B %bb.1

bb.1.for.body.preheader:
%11:gpr32 = ORRWrs $wzr, %9, 0
%0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
%14:fpr32 = FMOVSi 112
B %bb.3

bb.2.for.cond.cleanup:
RET_ReallyLR

bb.3.for.body:
successors: %bb.2(0x04000000), %bb.3(0x7c000000)

%1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
%2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
%3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9)
%15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr
early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv)
%4:gpr64all = COPY %16
%5:gpr64all = COPY %12
%17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
%6:gpr64all = COPY %17
Bcc 1, %bb.3, implicit $nzcv
B %bb.2

...
79 changes: 79 additions & 0 deletions llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s

# An acceptable loop by pipeliner: TBB == ExitBB, FBB == LoopBB, Compare and branch
# CHECK: Schedule Found? 1
# The loop exits via CBZW on %15, which is defined by an in-loop COPY of the
# decremented counter's 32-bit subreg; this exercises the compare-and-branch
# path of analyzeLoopForPipelining (virtual-register def, non-PHI, in LoopBB).

--- |
define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
entry:
%or.cond = icmp ult i32 %n, 2
br i1 %or.cond, label %for.end, label %for.body.preheader

for.body.preheader: ; preds = %entry
%i.07 = add i32 %n, -1
%0 = sext i32 %i.07 to i64
br label %for.body

for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%1 = shl nsw i64 %indvars.iv, 2
%scevgep = getelementptr i8, ptr %b, i64 %1
%2 = load float, ptr %scevgep, align 4
%add = fadd float %2, 1.000000e+00
%3 = shl nsw i64 %indvars.iv, 2
%scevgep11 = getelementptr i8, ptr %a, i64 %3
store float %add, ptr %scevgep11, align 4
%indvars.iv.next = add nsw i64 %indvars.iv, -1
%4 = add i64 %indvars.iv, -1
%5 = and i64 %4, 4294967295
%tobool.not = icmp eq i64 %5, 0
br i1 %tobool.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
ret void
}

...
---
name: func
tracksRegLiveness: true
liveins:
- { reg: '$x0', virtual-reg: '%3' }
- { reg: '$x1', virtual-reg: '%4' }
- { reg: '$w2', virtual-reg: '%5' }
body: |
bb.0.entry:
liveins: $x0, $x1, $w2

%5:gpr32common = COPY $w2
%4:gpr64common = COPY $x1
%3:gpr64common = COPY $x0
dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
Bcc 3, %bb.3, implicit $nzcv
B %bb.1

bb.1.for.body.preheader:
%7:gpr32common = SUBWri %5, 1, 0
%9:gpr64all = IMPLICIT_DEF
%8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
%10:gpr64 = SBFMXri killed %8, 0, 31
%0:gpr64all = COPY %10
%12:fpr32 = FMOVSi 112

bb.2.for.body:
successors: %bb.3(0x04000000), %bb.2(0x7c000000)

%1:gpr64common = PHI %0, %bb.1, %2, %bb.2
%11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
%13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
%14:gpr64common = SUBXri %1, 1, 0
%2:gpr64all = COPY %14
%15:gpr32 = COPY %14.sub_32
CBZW killed %15, %bb.3
B %bb.2

bb.3.for.end:
RET_ReallyLR

...
Loading