Skip to content

Commit 83dbce0

Browse files
committed
[AArch64][MachinePipeliner] Add pipeliner support for AArch64
Add AArch64 implementations for the interfaces of the MachinePipeliner pass.
The pass is disabled by default for AArch64; it is enabled by specifying
--aarch64-enable-pipeliner.

Five tests in llvm-test-suite show a performance improvement of more than 5%
on a Neoverse V1 processor:

| test                                                             | improvement |
| ---------------------------------------------------------------- | -----------:|
| MultiSource/Benchmarks/TSVC/Recurrences-dbl/Recurrences-dbl.test |         16% |
| MultiSource/Benchmarks/TSVC/Recurrences-flt/Recurrences-flt.test |         16% |
| SingleSource/Benchmarks/Adobe-C++/loop_unroll.test               |         14% |
| SingleSource/Benchmarks/Misc/flops-5.test                        |         13% |
| SingleSource/Benchmarks/BenchmarkGame/spectral-norm.test         |          6% |

(base flags: -mcpu=neoverse-v1 -O3 -mrecip; flags for pipelining:
-mllvm -aarch64-enable-pipeliner -mllvm -pipeliner-max-stages=100
-mllvm -pipeliner-max-mii=100 -mllvm -pipeliner-enable-copytophi=0)

On the other hand, there are cases of significant performance degradation.
Algorithm improvements and adding an option/pragma will be needed in the
future.
1 parent ff31940 commit 83dbce0

13 files changed

+761
-0
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9608,6 +9608,111 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
96089608
return ExitMBB->begin();
96099609
}
96109610

9611+
namespace {
9612+
class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
9613+
MachineInstr *PredBranch;
9614+
SmallVector<MachineOperand, 4> Cond;
9615+
9616+
public:
9617+
AArch64PipelinerLoopInfo(MachineInstr *PredBranch,
9618+
const SmallVectorImpl<MachineOperand> &Cond)
9619+
: PredBranch(PredBranch), Cond(Cond.begin(), Cond.end()) {}
9620+
9621+
bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
9622+
// Make the instructions for loop control be placed in stage 0.
9623+
// The predecessors of PredBranch are considered by the caller.
9624+
return MI == PredBranch;
9625+
}
9626+
9627+
std::optional<bool> createTripCountGreaterCondition(
9628+
int TC, MachineBasicBlock &MBB,
9629+
SmallVectorImpl<MachineOperand> &CondParam) override {
9630+
// A branch instruction will be inserted as "if (Cond) goto epilogue".
9631+
// Cond is normalized for such use.
9632+
// The predecessors of the branch are assumed to have already been inserted.
9633+
CondParam = Cond;
9634+
return {};
9635+
}
9636+
9637+
void setPreheader(MachineBasicBlock *NewPreheader) override {}
9638+
9639+
void adjustTripCount(int TripCountAdjust) override {}
9640+
9641+
void disposed() override {}
9642+
};
9643+
} // namespace
9644+
9645+
static bool isCompareAndBranch(unsigned Opcode) {
9646+
switch (Opcode) {
9647+
case AArch64::CBZW:
9648+
case AArch64::CBZX:
9649+
case AArch64::CBNZW:
9650+
case AArch64::CBNZX:
9651+
case AArch64::TBZW:
9652+
case AArch64::TBZX:
9653+
case AArch64::TBNZW:
9654+
case AArch64::TBNZX:
9655+
return true;
9656+
}
9657+
return false;
9658+
}
9659+
9660+
/// Analyze the single-basic-block loop \p LoopBB for software pipelining.
/// Returns a PipelinerLoopInfo carrying the normalized exit condition and the
/// in-loop instruction that computes it, or nullptr if the loop shape is not
/// supported (unanalyzable branch, infinite loop, no conditional exit, or an
/// exit test that cannot be traced to a non-PHI instruction inside the loop).
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  // Bail out if the latch's terminators cannot be analyzed.
  if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
    return nullptr;

  // Infinite loops are not supported
  if (TBB == LoopBB && FBB == LoopBB)
    return nullptr;

  // Must be conditional branch
  if (FBB == nullptr)
    return nullptr;

  assert((TBB == LoopBB || FBB == LoopBB) &&
         "The Loop must be a single-basic-block loop");

  // Normalization for createTripCountGreaterCondition()
  // After this, Cond is true when the loop should EXIT (branch to epilogue).
  if (TBB == LoopBB)
    reverseBranchCondition(Cond);

  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
  const TargetRegisterInfo &TRI = getRegisterInfo();

  // Find the immediate predecessor of the conditional branch
  MachineInstr *PredBranch = nullptr;
  if (CondBranch->getOpcode() == AArch64::Bcc) {
    // Bcc reads NZCV: scan backwards for the last instruction in the block
    // that defines NZCV; that is the compare feeding the branch.
    for (MachineInstr &MI : reverse(*LoopBB)) {
      if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
        PredBranch = &MI;
        break;
      }
    }
    if (!PredBranch)
      return nullptr;
  } else if (isCompareAndBranch(CondBranch->getOpcode())) {
    // CBZ/CBNZ/TBZ/TBNZ test a register: follow the virtual register back to
    // its (SSA) defining instruction.
    const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
    Register Reg = CondBranch->getOperand(0).getReg();
    if (!Reg.isVirtual())
      return nullptr;
    PredBranch = MRI.getVRegDef(Reg);

    // MachinePipeliner does not expect that the immediate predecessor is a Phi
    if (PredBranch->isPHI())
      return nullptr;

    // The condition must be computed inside the loop body itself.
    if (PredBranch->getParent() != LoopBB)
      return nullptr;
  } else {
    // Any other terminator kind (e.g. indirect branch) is unsupported.
    return nullptr;
  }

  return std::make_unique<AArch64PipelinerLoopInfo>(PredBranch, Cond);
}
9715+
96119716
#define GET_INSTRINFO_HELPERS
96129717
#define GET_INSTRMAP_INFO
96139718
#include "AArch64GenInstrInfo.inc"

llvm/lib/Target/AArch64/AArch64InstrInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
247247
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
248248
const DebugLoc &DL,
249249
int *BytesAdded = nullptr) const override;
250+
251+
  /// Analyze a single-basic-block loop for software pipelining
  /// (MachinePipeliner). Returns the pipeliner loop info on success, or
  /// nullptr if the loop is not in a supported form.
  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
  analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
253+
250254
bool
251255
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
252256
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,3 +540,7 @@ AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
540540
// performance regression or incompatibility with execute-only mappings.
541541
return AArch64PAuth::AuthCheckMethod::None;
542542
}
543+
544+
// Gate the MachinePipeliner on subtargets that provide an instruction
// scheduling model (hasInstrSchedModel); without one the pipeliner has no
// latency/resource information to schedule with.
bool AArch64Subtarget::enableMachinePipeliner() const {
  return getSchedModel().hasInstrSchedModel();
}

llvm/lib/Target/AArch64/AArch64Subtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
201201
bool enableMachineScheduler() const override { return true; }
202202
bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
203203

204+
  // True only for subtargets with an instruction scheduling model; defined in
  // AArch64Subtarget.cpp.
  bool enableMachinePipeliner() const override;
  // Software pipelining (SMS) should not use the DFA-based hazard recognizer
  // on AArch64; it relies on the instruction scheduling model instead.
  bool useDFAforSMS() const override { return false; }
206+
204207
/// Returns ARM processor family.
205208
/// Avoid this function! CPU specifics should be kept local to this class
206209
/// and preferably modeled with SubtargetFeatures or properties in

llvm/lib/Target/AArch64/AArch64TargetMachine.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,11 @@ static cl::opt<bool>
204204
cl::desc("Enable sinking and folding of instruction copies"),
205205
cl::init(true), cl::Hidden);
206206

207+
// Software pipelining for AArch64 is opt-in: off by default, enabled with
// -aarch64-enable-pipeliner (checked when the pass pipeline is built).
static cl::opt<bool>
    EnableMachinePipeliner("aarch64-enable-pipeliner",
                           cl::desc("Enable Machine Pipeliner for AArch64"),
                           cl::init(false), cl::Hidden);
211+
207212
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
208213
// Register the target.
209214
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -779,6 +784,8 @@ void AArch64PassConfig::addPreRegAlloc() {
779784
// be register coalescer friendly.
780785
addPass(&PeepholeOptimizerID);
781786
}
787+
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMachinePipeliner)
788+
addPass(&MachinePipelinerID);
782789
}
783790

784791
void AArch64PassConfig::addPostRegAlloc() {
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s

# An acceptable loop by pipeliner: TBB == ExitBB, FBB == LoopBB, Branch with NZCV flags
# The loop latch is SUBSXri (sets NZCV) + Bcc to the exit block, so
# AArch64InstrInfo::analyzeLoopForPipelining accepts the loop and the
# pipeliner finds a schedule.
# CHECK: Schedule Found? 1

--- |
  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
  entry:
    %cmp6 = icmp sgt i32 %n, 0
    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

  for.body.preheader:                               ; preds = %entry
    %wide.trip.count = zext nneg i32 %n to i64
    br label %for.body

  for.cond.cleanup:                                 ; preds = %for.body, %entry
    ret void

  for.body:                                         ; preds = %for.body.preheader, %for.body
    %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
    %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ]
    %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
    %0 = load float, ptr %lsr.iv9, align 4
    %add = fadd float %0, 1.000000e+00
    store float %add, ptr %lsr.iv, align 4
    %scevgep = getelementptr i8, ptr %lsr.iv, i64 4
    %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4
    %lsr.iv.next = add nsw i64 %lsr.iv11, -1
    %exitcond.not = icmp eq i64 %lsr.iv.next, 0
    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
  }

...
---
name:            func
tracksRegLiveness: true
liveins:
  - { reg: '$x0', virtual-reg: '%7' }
  - { reg: '$x1', virtual-reg: '%8' }
  - { reg: '$w2', virtual-reg: '%9' }
body:             |
  bb.0.entry:
    successors: %bb.1(0x50000000), %bb.2(0x30000000)
    liveins: $x0, $x1, $w2

    %9:gpr32common = COPY $w2
    %8:gpr64 = COPY $x1
    %7:gpr64 = COPY $x0
    dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
    Bcc 11, %bb.2, implicit $nzcv
    B %bb.1

  bb.1.for.body.preheader:
    %11:gpr32 = ORRWrs $wzr, %9, 0
    %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
    %14:fpr32 = FMOVSi 112
    B %bb.3

  bb.2.for.cond.cleanup:
    RET_ReallyLR

  bb.3.for.body:
    successors: %bb.2(0x04000000), %bb.3(0x7c000000)

    %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
    %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
    %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
    early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9)
    %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr
    early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv)
    %4:gpr64all = COPY %16
    %5:gpr64all = COPY %12
    %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
    %6:gpr64all = COPY %17
    Bcc 0, %bb.2, implicit $nzcv
    B %bb.3

...
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s

# An acceptable loop by pipeliner: TBB == LoopBB, FBB == ExitBB, Branch with NZCV flags
# Same loop as the TBB==ExitBB variant, but the Bcc targets the loop header
# (condition reversed), exercising the reverseBranchCondition normalization in
# AArch64InstrInfo::analyzeLoopForPipelining.
# CHECK: Schedule Found? 1

--- |
  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
  entry:
    %cmp6 = icmp sgt i32 %n, 0
    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

  for.body.preheader:                               ; preds = %entry
    %wide.trip.count = zext nneg i32 %n to i64
    br label %for.body

  for.cond.cleanup:                                 ; preds = %for.body, %entry
    ret void

  for.body:                                         ; preds = %for.body.preheader, %for.body
    %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
    %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ]
    %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
    %0 = load float, ptr %lsr.iv9, align 4
    %add = fadd float %0, 1.000000e+00
    store float %add, ptr %lsr.iv, align 4
    %scevgep = getelementptr i8, ptr %lsr.iv, i64 4
    %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4
    %lsr.iv.next = add nsw i64 %lsr.iv11, -1
    %exitcond.not = icmp eq i64 %lsr.iv.next, 0
    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
  }

...
---
name:            func
tracksRegLiveness: true
liveins:
  - { reg: '$x0', virtual-reg: '%7' }
  - { reg: '$x1', virtual-reg: '%8' }
  - { reg: '$w2', virtual-reg: '%9' }
body:             |
  bb.0.entry:
    successors: %bb.1(0x50000000), %bb.2(0x30000000)
    liveins: $x0, $x1, $w2

    %9:gpr32common = COPY $w2
    %8:gpr64 = COPY $x1
    %7:gpr64 = COPY $x0
    dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
    Bcc 11, %bb.2, implicit $nzcv
    B %bb.1

  bb.1.for.body.preheader:
    %11:gpr32 = ORRWrs $wzr, %9, 0
    %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
    %14:fpr32 = FMOVSi 112
    B %bb.3

  bb.2.for.cond.cleanup:
    RET_ReallyLR

  bb.3.for.body:
    successors: %bb.2(0x04000000), %bb.3(0x7c000000)

    %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
    %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
    %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
    early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9)
    %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr
    early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv)
    %4:gpr64all = COPY %16
    %5:gpr64all = COPY %12
    %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
    %6:gpr64all = COPY %17
    Bcc 1, %bb.3, implicit $nzcv
    B %bb.2

...
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s

# An acceptable loop by pipeliner: TBB == ExitBB, FBB == LoopBB, Compare and branch
# The loop latch uses CBZW (compare-and-branch on a virtual register), so
# AArch64InstrInfo::analyzeLoopForPipelining traces the register back to its
# in-loop defining instruction and accepts the loop.
# CHECK: Schedule Found? 1

--- |
  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
  entry:
    %or.cond = icmp ult i32 %n, 2
    br i1 %or.cond, label %for.end, label %for.body.preheader

  for.body.preheader:                               ; preds = %entry
    %i.07 = add i32 %n, -1
    %0 = sext i32 %i.07 to i64
    br label %for.body

  for.body:                                         ; preds = %for.body.preheader, %for.body
    %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
    %1 = shl nsw i64 %indvars.iv, 2
    %scevgep = getelementptr i8, ptr %b, i64 %1
    %2 = load float, ptr %scevgep, align 4
    %add = fadd float %2, 1.000000e+00
    %3 = shl nsw i64 %indvars.iv, 2
    %scevgep11 = getelementptr i8, ptr %a, i64 %3
    store float %add, ptr %scevgep11, align 4
    %indvars.iv.next = add nsw i64 %indvars.iv, -1
    %4 = add i64 %indvars.iv, -1
    %5 = and i64 %4, 4294967295
    %tobool.not = icmp eq i64 %5, 0
    br i1 %tobool.not, label %for.end, label %for.body

  for.end:                                          ; preds = %for.body, %entry
    ret void
  }

...
---
name:            func
tracksRegLiveness: true
liveins:
  - { reg: '$x0', virtual-reg: '%3' }
  - { reg: '$x1', virtual-reg: '%4' }
  - { reg: '$w2', virtual-reg: '%5' }
body:             |
  bb.0.entry:
    liveins: $x0, $x1, $w2

    %5:gpr32common = COPY $w2
    %4:gpr64common = COPY $x1
    %3:gpr64common = COPY $x0
    dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
    Bcc 3, %bb.3, implicit $nzcv
    B %bb.1

  bb.1.for.body.preheader:
    %7:gpr32common = SUBWri %5, 1, 0
    %9:gpr64all = IMPLICIT_DEF
    %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
    %10:gpr64 = SBFMXri killed %8, 0, 31
    %0:gpr64all = COPY %10
    %12:fpr32 = FMOVSi 112

  bb.2.for.body:
    successors: %bb.3(0x04000000), %bb.2(0x7c000000)

    %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
    %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
    %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
    STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
    %14:gpr64common = SUBXri %1, 1, 0
    %2:gpr64all = COPY %14
    %15:gpr32 = COPY %14.sub_32
    CBZW killed %15, %bb.3
    B %bb.2

  bb.3.for.end:
    RET_ReallyLR

...

0 commit comments

Comments
 (0)