Skip to content

Commit e6ab32b

Browse files
committed
[MachinePipeliner] Improve loop carried dependence analysis
The previous implementation had false positive/negative cases in analysis of loop carried dependence. A case of missed dependencies is caused by incorrect analysis of address increments. It is fixed by a strict analysis of recursive definitions. See added test swp-carried-dep4.mir. Excessive detection of dependence is corrected by improving the formula for determining overlap of address ranges to be accessed. See added test swp-carried-dep5.mir.
1 parent 4ec06b1 commit e6ab32b

File tree

8 files changed

+397
-82
lines changed

8 files changed

+397
-82
lines changed

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,9 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
294294

295295
static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
296296

297+
bool mayOverlapInLaterIter(const MachineInstr *BaseMI,
298+
const MachineInstr *OtherMI) const;
299+
297300
private:
298301
void addLoopCarriedDependences(AAResults *AA);
299302
void updatePhiDependences();
@@ -313,7 +316,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
313316
void computeNodeOrder(NodeSetType &NodeSets);
314317
void checkValidNodeOrder(const NodeSetType &Circuits) const;
315318
bool schedulePipeline(SMSchedule &Schedule);
316-
bool computeDelta(MachineInstr &MI, unsigned &Delta) const;
319+
bool computeDelta(const MachineInstr &MI, int &Delta) const;
317320
MachineInstr *findDefInLoop(Register Reg);
318321
bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
319322
unsigned &OffsetPos, unsigned &NewBase,

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 188 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -2568,9 +2568,104 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
25682568
return scheduleFound && Schedule.getMaxStageCount() > 0;
25692569
}
25702570

2571+
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI) {
2572+
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2573+
Register Result;
2574+
for (const MachineOperand &Use : MI.all_uses()) {
2575+
Register Reg = Use.getReg();
2576+
if (!Reg.isVirtual())
2577+
return Register();
2578+
if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2579+
continue;
2580+
if (Result)
2581+
return Register();
2582+
Result = Reg;
2583+
}
2584+
return Result;
2585+
}
2586+
2587+
/// When Op is a value that is incremented recursively in a loop and there is a
2588+
/// unique instruction that increments it, returns true and sets Value.
2589+
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value) {
2590+
if (!Op.isReg() || !Op.getReg().isVirtual())
2591+
return false;
2592+
2593+
Register OrgReg = Op.getReg();
2594+
Register CurReg = OrgReg;
2595+
const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
2596+
const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
2597+
2598+
const TargetInstrInfo *TII =
2599+
LoopBB->getParent()->getSubtarget().getInstrInfo();
2600+
const TargetRegisterInfo *TRI =
2601+
LoopBB->getParent()->getSubtarget().getRegisterInfo();
2602+
2603+
MachineInstr *Phi = nullptr;
2604+
MachineInstr *Increment = nullptr;
2605+
2606+
// Traverse definitions until it reaches Op or an instruction that does not
2607+
// satisfy the condition.
2608+
// Acceptable example:
2609+
// bb.0:
2610+
// %0 = PHI %3, %bb.0, ...
2611+
// %2 = ADD %0, Value
2612+
// ... = LOAD %2(Op)
2613+
// %3 = COPY %2
2614+
while (true) {
2615+
if (!CurReg.isValid() || !CurReg.isVirtual())
2616+
return false;
2617+
MachineInstr *Def = MRI.getVRegDef(CurReg);
2618+
if (Def->getParent() != LoopBB)
2619+
return false;
2620+
2621+
if (Def->isCopy()) {
2622+
// Ignore copy instructions unless they contain subregisters
2623+
if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
2624+
return false;
2625+
CurReg = Def->getOperand(1).getReg();
2626+
} else if (Def->isPHI()) {
2627+
// There must be just one Phi
2628+
if (Phi)
2629+
return false;
2630+
Phi = Def;
2631+
CurReg = getLoopPhiReg(*Def, LoopBB);
2632+
} else if (TII->getIncrementValue(*Def, Value)) {
2633+
// Potentially a unique increment
2634+
if (Increment)
2635+
// Multiple increments exist
2636+
return false;
2637+
2638+
const MachineOperand *BaseOp;
2639+
int64_t Offset;
2640+
bool OffsetIsScalable;
2641+
if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
2642+
TRI)) {
2643+
// Pre/post increment instruction
2644+
CurReg = BaseOp->getReg();
2645+
} else {
2646+
// If only one of the operands is defined within the loop, it is assumed
2647+
// to be an incremented value.
2648+
CurReg = findUniqueOperandDefinedInLoop(*Def);
2649+
if (!CurReg.isValid())
2650+
return false;
2651+
}
2652+
Increment = Def;
2653+
} else {
2654+
return false;
2655+
}
2656+
if (CurReg == OrgReg)
2657+
break;
2658+
}
2659+
2660+
if (!Phi || !Increment)
2661+
return false;
2662+
2663+
return true;
2664+
}
2665+
25712666
/// Return true if we can compute the amount the instruction changes
25722667
/// during each iteration. Set Delta to the amount of the change.
2573-
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
2668+
bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
25742669
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
25752670
const MachineOperand *BaseOp;
25762671
int64_t Offset;
@@ -2585,24 +2680,7 @@ bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
25852680
if (!BaseOp->isReg())
25862681
return false;
25872682

2588-
Register BaseReg = BaseOp->getReg();
2589-
2590-
MachineRegisterInfo &MRI = MF.getRegInfo();
2591-
// Check if there is a Phi. If so, get the definition in the loop.
2592-
MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2593-
if (BaseDef && BaseDef->isPHI()) {
2594-
BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2595-
BaseDef = MRI.getVRegDef(BaseReg);
2596-
}
2597-
if (!BaseDef)
2598-
return false;
2599-
2600-
int D = 0;
2601-
if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2602-
return false;
2603-
2604-
Delta = D;
2605-
return true;
2683+
return findLoopIncrementValue(*BaseOp, Delta);
26062684
}
26072685

26082686
/// Check if we can change the instruction to use an offset value from the
@@ -2720,6 +2798,96 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
27202798
return Def;
27212799
}
27222800

2801+
/// Return false if there is no overlap between the region accessed by BaseMI in
2802+
/// an iteration and the region accessed by OtherMI in subsequent iterations.
2803+
bool SwingSchedulerDAG::mayOverlapInLaterIter(
2804+
const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
2805+
int DeltaB, DeltaO, Delta;
2806+
if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
2807+
DeltaB != DeltaO)
2808+
return true;
2809+
Delta = DeltaB;
2810+
2811+
const MachineOperand *BaseOpB, *BaseOpO;
2812+
int64_t OffsetB, OffsetO;
2813+
bool OffsetBIsScalable, OffsetOIsScalable;
2814+
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2815+
if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
2816+
OffsetBIsScalable, TRI) ||
2817+
!TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
2818+
OffsetOIsScalable, TRI))
2819+
return true;
2820+
2821+
if (OffsetBIsScalable || OffsetOIsScalable)
2822+
return true;
2823+
2824+
if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
2825+
// Pass cases with different base operands but same initial values.
2826+
// Typically for when pre/post increment is used.
2827+
2828+
if (!BaseOpB->isReg() || !BaseOpO->isReg())
2829+
return true;
2830+
Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
2831+
if (!RegB.isVirtual() || !RegO.isVirtual())
2832+
return true;
2833+
2834+
MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
2835+
MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
2836+
if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
2837+
return true;
2838+
2839+
unsigned InitValB = 0;
2840+
unsigned LoopValB = 0;
2841+
unsigned InitValO = 0;
2842+
unsigned LoopValO = 0;
2843+
getPhiRegs(*DefB, BB, InitValB, LoopValB);
2844+
getPhiRegs(*DefO, BB, InitValO, LoopValO);
2845+
MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
2846+
MachineInstr *InitDefO = MRI.getVRegDef(InitValO);
2847+
2848+
if (!InitDefB->isIdenticalTo(*InitDefO))
2849+
return true;
2850+
}
2851+
2852+
LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
2853+
LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();
2854+
2855+
// This is the main test, which checks the offset values and the loop
2856+
// increment value to determine if the accesses may be loop carried.
2857+
if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
2858+
return true;
2859+
2860+
LLVM_DEBUG({
2861+
dbgs() << "Overlap check:\n";
2862+
dbgs() << " BaseMI: ";
2863+
BaseMI->dump();
2864+
dbgs() << " Base + " << OffsetB << " + I * " << Delta
2865+
<< ", Len: " << AccessSizeB.getValue() << "\n";
2866+
dbgs() << " OtherMI: ";
2867+
OtherMI->dump();
2868+
dbgs() << " Base + " << OffsetO << " + I * " << Delta
2869+
<< ", Len: " << AccessSizeO.getValue() << "\n";
2870+
});
2871+
2872+
if (Delta < 0) {
2873+
int64_t BaseMinAddr = OffsetB;
2874+
int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
2875+
if (BaseMinAddr > OhterNextIterMaxAddr) {
2876+
LLVM_DEBUG(dbgs() << " Result: No overlap\n");
2877+
return false;
2878+
}
2879+
} else {
2880+
int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
2881+
int64_t OtherNextIterMinAddr = OffsetO + Delta;
2882+
if (BaseMaxAddr < OtherNextIterMinAddr) {
2883+
LLVM_DEBUG(dbgs() << " Result: No overlap\n");
2884+
return false;
2885+
}
2886+
}
2887+
LLVM_DEBUG(dbgs() << " Result: Overlap\n");
2888+
return true;
2889+
}
2890+
27232891
/// Return true for an order or output dependence that is loop carried
27242892
/// potentially. A dependence is loop carried if the destination defines a value
27252893
/// that may be used or defined by the source in a subsequent iteration.
@@ -2753,61 +2921,7 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
27532921
// The conservative assumption is that a dependence between memory operations
27542922
// may be loop carried. The following code checks when it can be proved that
27552923
// there is no loop carried dependence.
2756-
unsigned DeltaS, DeltaD;
2757-
if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2758-
return true;
2759-
2760-
const MachineOperand *BaseOpS, *BaseOpD;
2761-
int64_t OffsetS, OffsetD;
2762-
bool OffsetSIsScalable, OffsetDIsScalable;
2763-
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2764-
if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2765-
TRI) ||
2766-
!TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2767-
TRI))
2768-
return true;
2769-
2770-
assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2771-
"Expected offsets to be byte offsets");
2772-
2773-
MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2774-
MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2775-
if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2776-
return true;
2777-
2778-
unsigned InitValS = 0;
2779-
unsigned LoopValS = 0;
2780-
unsigned InitValD = 0;
2781-
unsigned LoopValD = 0;
2782-
getPhiRegs(*DefS, BB, InitValS, LoopValS);
2783-
getPhiRegs(*DefD, BB, InitValD, LoopValD);
2784-
MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2785-
MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2786-
2787-
if (!InitDefS->isIdenticalTo(*InitDefD))
2788-
return true;
2789-
2790-
// Check that the base register is incremented by a constant value for each
2791-
// iteration.
2792-
MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2793-
int D = 0;
2794-
if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2795-
return true;
2796-
2797-
LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
2798-
LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
2799-
2800-
// This is the main test, which checks the offset values and the loop
2801-
// increment value to determine if the accesses may be loop carried.
2802-
if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
2803-
return true;
2804-
2805-
if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
2806-
DeltaD < AccessSizeD.getValue())
2807-
return true;
2808-
2809-
return (OffsetS + (int64_t)AccessSizeS.getValue() <
2810-
OffsetD + (int64_t)AccessSizeD.getValue());
2924+
return mayOverlapInLaterIter(DI, SI);
28112925
}
28122926

28132927
void SwingSchedulerDAG::postProcessDAG() {

llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@
33

44
# Test that the loop carried dependence check correctly identifies a recurrence.
55

6+
# CHECK: Overlap check:
7+
# CHECK-NEXT: BaseMI: S2_storerh_io %{{[0-9]+}}:intregs, 0, %{{[0-9]+}}:intregs :: (store (s16) into %ir.lsr.iv24)
8+
# CHECK-NEXT: Base + 0 + I * 4, Len: 2
9+
# CHECK-NEXT: OtherMI: %{{[0-9]+}}:intregs = L2_loadrh_io %{{[0-9]+}}:intregs, -8 :: (load (s16) from %ir.cgep10)
10+
# CHECK-NEXT: Base + -8 + I * 4, Len: 2
11+
# CHECK-NEXT: Result: Overlap
12+
613
# CHECK: Rec NodeSet
714
# CHECK: Rec NodeSet
815
# CHECK: Rec NodeSet

llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
22
# REQUIRES: asserts
33

4-
# Test that the loop carried dependence check correctly identifies a recurrence
4+
# Test that it correctly recognizes that there is no loop carried dependence
55
# when the loop variable decreases and the array index offset is negative.
66

7-
# CHECK: Rec NodeSet
8-
# CHECK: Rec NodeSet
9-
# CHECK: SU(3)
10-
# CHECK: SU(4)
11-
# CHECK: SU(5)
7+
# CHECK: Overlap check:
8+
# CHECK-NEXT: BaseMI: S2_storeri_io %{{[0-9]+}}:intregs, 0, %{{[0-9]+}}:intregs :: (store (s32) into %ir.lsr.iv1)
9+
# CHECK-NEXT: Base + 0 + I * -4, Len: 4
10+
# CHECK-NEXT: OtherMI: %{{[0-9]+}}:intregs = L2_loadri_io %{{[0-9]+}}:intregs, -8 :: (load (s32) from %ir.cgep)
11+
# CHECK-NEXT: Base + -8 + I * -4, Len: 4
12+
# CHECK-NEXT: Result: No overlap
1213

1314
--- |
1415

llvm/test/CodeGen/Hexagon/swp-carried-dep3.mir

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@
77
# requires to use a single CHECK-NOT to match such a Rec NodeSet. Fortunately
88
# the atom '.' does not match a newline but anything else on a line.
99

10+
# CHECK: Overlap check:
11+
# CHECK-NEXT: BaseMI: %13:intregs = S2_storerh_pi %12:intregs(tied-def 0), 2, %20:intregs :: (store (s16))
12+
# CHECK-NEXT: Base + 0 + I * 2, Len: 2
13+
# CHECK-NEXT: OtherMI: %19:intregs, %15:intregs = L2_loadrh_pi %14:intregs(tied-def 1), 2 :: (load (s16))
14+
# CHECK-NEXT: Base + 0 + I * 2, Len: 2
15+
# CHECK-NEXT: Result: No overlap
16+
1017
# CHECK-NOT: Rec NodeSet{{.+[[:space:]]}} SU(5){{.+[[:space:]]}} SU(7)
1118

1219
...
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
2+
# REQUIRES: asserts
3+
4+
# Loop carried dependence is assumed in cases where increment value cannot be recognized
5+
# (Not supported for multiple increment instruction)
6+
7+
# CHECK: Rec NodeSet
8+
# CHECK: Rec NodeSet
9+
# CHECK-NEXT: SU(1)
10+
# CHECK-NEXT: SU(2)
11+
12+
---
13+
name: test
14+
tracksRegLiveness: true
15+
16+
body: |
17+
bb.0:
18+
successors: %bb.1
19+
20+
%10:intregs = IMPLICIT_DEF
21+
%11:intregs = IMPLICIT_DEF
22+
J2_loop0i %bb.1, 6, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
23+
24+
bb.1 (machine-block-address-taken):
25+
successors: %bb.1, %bb.2
26+
27+
%0:intregs = PHI %11, %bb.0, %6, %bb.1
28+
%4:intregs = L2_loadri_io %0, 0 :: (load (s32))
29+
S2_storeri_io %0, 0, %10 :: (store (s32))
30+
%7:intregs = A2_addi %0, -8
31+
%6:intregs = A2_addi %7, 4
32+
ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
33+
J2_jump %bb.2, implicit-def dead $pc
34+
35+
bb.2:
36+
37+
...

0 commit comments

Comments
 (0)