Skip to content

Commit 996c869

Browse files
committed
[MachinePipeliner] Fix loop-carried dependencies analysis
The current MachinePipeliner misses several loop-carried edges, which can result in invalid code being generated. At least the following loop-carried dependencies can be missed: - Memory dependencies from top to bottom. - Example: ``` for (int i=1; i<n; i++) { a[i] = ...; a[i-1] = ...; } ``` - Store to store dependencies. - Store to load dependencies. - Output (write-after-write) dependencies. - Use of alias analysis results that are valid only within a single iteration. - Example: ``` void f(double * restrict a, double * restrict b); ... for (int i=0; i<n; i++) f(ptr0, ptr1); // will be inlined ``` This patch adds these dependencies and fixes the resulting correctness issues. In addition, the current analysis can add excessive dependencies because a loop-carried memory dependence from bottom to top is represented by a forward-direction (i.e., top to bottom) edge. This patch also removes such dependencies.
1 parent 8a0914c commit 996c869

24 files changed

+1774
-462
lines changed

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 44 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242

4343
#include "llvm/ADT/STLExtras.h"
4444
#include "llvm/ADT/SetVector.h"
45+
#include "llvm/Analysis/AliasAnalysis.h"
4546
#include "llvm/CodeGen/DFAPacketizer.h"
4647
#include "llvm/CodeGen/MachineDominators.h"
4748
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
@@ -190,6 +191,33 @@ class SwingSchedulerDDGEdge {
190191
bool ignoreDependence(bool IgnoreAnti) const;
191192
};
192193

194+
/// Container for loop-carried dependence edges, kept per SUnit and split into
/// output dependencies (grouped by register) and order dependencies.
/// NOTE(review): whether the map key is the source or the destination of an
/// edge is not visible in this header -- confirm against the implementation.
struct LoopCarriedEdges {
195+
/// For one SUnit: maps a register to the set of SUnits associated with it.
using OutputDep = SmallDenseMap<Register, SmallSetVector<SUnit *, 4>>;
196+
/// For one SUnit: the set of SUnits it has an order dependence with.
using OrderDep = SmallSetVector<SUnit *, 8>;
197+
/// Maps each SUnit to its output-dependence information.
using OutputDepsType = DenseMap<SUnit *, OutputDep>;
198+
/// Maps each SUnit to its order-dependence information.
using OrderDepsType = DenseMap<SUnit *, OrderDep>;
199+
200+
/// Output dependencies, keyed by SUnit.
OutputDepsType OutputDeps;
201+
/// Order dependencies, keyed by SUnit.
OrderDepsType OrderDeps;
202+
203+
/// Looks up \p Key in OutputDeps without inserting.
/// \returns a pointer to the entry stored in the map, or nullptr when
/// \p Key has no entry.
const OutputDep *getOutputDepOrNull(SUnit *Key) const {
204+
auto Ite = OutputDeps.find(Key);
205+
if (Ite == OutputDeps.end())
206+
return nullptr;
207+
return &Ite->second;
208+
}
209+
210+
/// Looks up \p Key in OrderDeps without inserting.
/// \returns a pointer to the entry stored in the map, or nullptr when
/// \p Key has no entry.
const OrderDep *getOrderDepOrNull(SUnit *Key) const {
211+
auto Ite = OrderDeps.find(Key);
212+
if (Ite == OrderDeps.end())
213+
return nullptr;
214+
return &Ite->second;
215+
}
216+
217+
/// Debug dump for \p SU; defined out of line.
/// NOTE(review): presumably \p TRI and \p MRI are used to print register
/// names -- confirm against the definition.
void dump(SUnit *SU, const TargetRegisterInfo *TRI,
218+
const MachineRegisterInfo *MRI) const;
219+
};
220+
193221
/// Represents dependencies between instructions. This class is a wrapper of
194222
/// `SUnits` and its dependencies to manipulate back-edges in a natural way.
195223
/// Currently it only supports back-edges via PHI, which are expressed as
@@ -217,8 +245,12 @@ class SwingSchedulerDDG {
217245
SwingSchedulerDDGEdges &getEdges(const SUnit *SU);
218246
const SwingSchedulerDDGEdges &getEdges(const SUnit *SU) const;
219247

248+
void addLoopCarriedEdges(std::vector<SUnit> &SUnits,
249+
const LoopCarriedEdges &LCE);
250+
220251
public:
221-
SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU);
252+
SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU,
253+
const LoopCarriedEdges &LCE);
222254

223255
const EdgesType &getInEdges(const SUnit *SU) const;
224256

@@ -285,22 +317,14 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
285317
BitVector Blocked;
286318
SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
287319
SmallVector<SmallVector<int, 4>, 16> AdjK;
288-
// Node to Index from ScheduleDAGTopologicalSort
289-
std::vector<int> *Node2Idx;
320+
SmallVector<BitVector, 16> LoopCarried;
290321
unsigned NumPaths = 0u;
291-
static unsigned MaxPaths;
292322

293323
public:
294-
Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
295-
: SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
296-
Node2Idx = new std::vector<int>(SUs.size());
297-
unsigned Idx = 0;
298-
for (const auto &NodeNum : Topo)
299-
Node2Idx->at(NodeNum) = Idx++;
300-
}
324+
/// Initializes SUnits from \p SUs and sizes Blocked, B and AdjK to
/// SUs.size() entries. LoopCarried is not sized here; it is left
/// default-constructed.
Circuits(std::vector<SUnit> &SUs)
325+
: SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {}
301326
Circuits &operator=(const Circuits &other) = delete;
302327
Circuits(const Circuits &other) = delete;
303-
~Circuits() { delete Node2Idx; }
304328

305329
/// Reset the data structures used in the circuit algorithm.
306330
void reset() {
@@ -310,9 +334,9 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
310334
NumPaths = 0;
311335
}
312336

313-
void createAdjacencyStructure(SwingSchedulerDAG *DAG);
337+
void createAdjacencyStructure(const SwingSchedulerDDG *DDG);
314338
bool circuit(int V, int S, NodeSetType &NodeSets,
315-
const SwingSchedulerDAG *DAG, bool HasBackedge = false);
339+
const SwingSchedulerDDG *DDG, bool HasLoopCarriedEdge = false);
316340
void unblock(int U);
317341
};
318342

@@ -366,7 +390,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
366390
return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
367391
}
368392

369-
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const;
393+
bool hasLoopCarriedMemDep(const MachineInstr *Src, const MachineInstr *Dst,
394+
BatchAAResults *BAA) const;
370395

371396
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
372397

@@ -390,11 +415,11 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
390415

391416
/// Returns the pointer obtained from DDG.get(); the caller receives a
/// pointer to const and does not take ownership through this accessor.
const SwingSchedulerDDG *getDDG() const { return DDG.get(); }
392417

393-
bool mayOverlapInLaterIter(const MachineInstr *BaseMI,
394-
const MachineInstr *OtherMI) const;
418+
AliasResult::Kind mayOverlapInLaterIter(const MachineInstr *BaseMI,
419+
const MachineInstr *OtherMI) const;
395420

396421
private:
397-
void addLoopCarriedDependences(AAResults *AA);
422+
LoopCarriedEdges addLoopCarriedDependences(AAResults *AA);
398423
void updatePhiDependences();
399424
void changeDependences();
400425
unsigned calculateResMII();
@@ -440,7 +465,7 @@ class NodeSet {
440465
using iterator = SetVector<SUnit *>::const_iterator;
441466

442467
NodeSet() = default;
443-
NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG)
468+
NodeSet(iterator S, iterator E, const SwingSchedulerDDG *DDG)
444469
: Nodes(S, E), HasRecurrence(true) {
445470
// Calculate the latency of this node set.
446471
// Example to demonstrate the calculation:
@@ -456,7 +481,6 @@ class NodeSet {
456481
//
457482
// Hold a map from each SUnit in the circle to the maximum distance from the
458483
// source node by only considering the nodes.
459-
const SwingSchedulerDDG *DDG = DAG->getDDG();
460484
DenseMap<SUnit *, unsigned> SUnitToDistance;
461485
for (auto *Node : Nodes)
462486
SUnitToDistance[Node] = 0;
@@ -474,22 +498,6 @@ class NodeSet {
474498
DV = DU + Succ.getLatency();
475499
}
476500
}
477-
// Handle a back-edge in loop carried dependencies
478-
SUnit *FirstNode = Nodes[0];
479-
SUnit *LastNode = Nodes[Nodes.size() - 1];
480-
481-
for (auto &PI : DDG->getInEdges(LastNode)) {
482-
// If we have an order dep that is potentially loop carried then a
483-
// back-edge exists between the last node and the first node that isn't
484-
// modeled in the DAG. Handle it manually by adding 1 to the distance of
485-
// the last node.
486-
if (PI.getSrc() != FirstNode || !PI.isOrderDep() ||
487-
!DAG->isLoopCarriedDep(PI))
488-
continue;
489-
SUnitToDistance[FirstNode] =
490-
std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1);
491-
}
492-
493501
// The latency is the distance from the source node to itself.
494502
Latency = SUnitToDistance[Nodes.front()];
495503
}

0 commit comments

Comments
 (0)