Skip to content

Commit e0f1d9d

Browse files
committed
[ModuloSchedule] Fix modulo expansion for data loop carried dependencies.
The new experimental expansion has a problem when a value has a data dependency on an instruction from a previous stage. This is due to the way we peel out the kernel. To fix that, I'm changing the way we peel out the kernel: we now peel the kernel (number of stages - 1) times. The code would be correct at this point if we didn't have to handle cases where the loop trip count is smaller than the number of stages. To handle this case, we move instructions between different epilogues based on their stage and remap the PHI instructions correctly. Differential Revision: https://reviews.llvm.org/D69538
1 parent fde11e9 commit e0f1d9d

File tree

5 files changed

+195
-21
lines changed

5 files changed

+195
-21
lines changed

llvm/include/llvm/CodeGen/ModuloSchedule.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,8 @@ class PeelingModuloScheduleExpander {
299299

300300
/// State passed from peelKernel to peelPrologAndEpilogs().
301301
std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
302+
/// Illegal phis that need to be deleted once we re-link stages.
303+
SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
302304

303305
public:
304306
PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
@@ -321,6 +323,13 @@ class PeelingModuloScheduleExpander {
321323
/// Peels one iteration of the rewritten kernel (BB) in the specified
322324
/// direction.
323325
MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
326+
// Delete instructions whose stage is less than MinStage in the given basic
327+
// block.
328+
void filterInstructions(MachineBasicBlock *MB, int MinStage);
329+
// Move instructions of the given stage from SourceBB to DestBB. Remap the phi
330+
// instructions to keep a valid IR.
331+
void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
332+
MachineBasicBlock *SourceBB, unsigned Stage);
324333
/// Peel the kernel forwards and backwards to produce prologs and epilogs,
325334
/// and stitch them together.
326335
void peelPrologAndEpilogs();

llvm/lib/CodeGen/ModuloSchedule.cpp

Lines changed: 132 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1582,6 +1582,99 @@ PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
15821582
return NewBB;
15831583
}
15841584

1585+
// Delete from MB every non-PHI instruction whose stage is known (not -1) and
// strictly less than MinStage. Before an instruction is erased, each user of
// the registers it defines is redirected to an equivalent register, so no
// use is left dangling.
//
// MB       - block to filter; it is walked backwards, starting at its first
//            terminator and ending at (and including) its first non-PHI
//            instruction.
// MinStage - instructions with 0 <= stage < MinStage are removed.
//
// NOTE(review): the loop dereferences getFirstInstrTerminator() and
// getFirstNonPHI(); this presumably relies on MB always having a terminator
// and at least one non-PHI instruction -- confirm against callers.
void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
1586+
int MinStage) {
1587+
for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
1588+
I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
1589+
// Advance the iterator before MI can be erased below.
MachineInstr *MI = &*I++;
1590+
int Stage = getStage(MI);
1591+
// Keep instructions with no stage (-1) and those at or above MinStage.
if (Stage == -1 || Stage >= MinStage)
1592+
continue;
1593+
1594+
for (MachineOperand &DefMO : MI->defs()) {
1595+
// Collect (user, replacement register) pairs first and rewrite them in a
// second pass; presumably this avoids modifying the use list while it is
// being iterated.
SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1596+
for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
1597+
// Only PHIs can use values from this block by construction.
1598+
// Look up the register equivalent to the PHI's result in MI's parent
// block; it replaces the register defined by the doomed instruction.
1599+
assert(UseMI.isPHI());
1600+
Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
1601+
MI->getParent());
1602+
Subs.emplace_back(&UseMI, Reg);
1603+
}
1604+
for (auto &Sub : Subs)
1605+
Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
1606+
*MRI.getTargetRegisterInfo());
1607+
}
1608+
// LIS may be null; only update it when it is present.
if (LIS)
1609+
LIS->RemoveMachineInstrFromMaps(*MI);
1610+
MI->eraseFromParent();
1611+
}
1612+
}
1613+
1614+
// Move every instruction of the given Stage from SourceBB into DestBB and
// patch up PHIs and register uses so that the result stays consistent.
//
// The work is done in four passes:
//   1. Walk SourceBB's non-PHI region: create a DestBB PHI for every PHI
//      found there (an "illegal" PHI), and move the instructions whose stage
//      equals Stage to DestBB.
//   2. Drop DestBB PHIs whose incoming value is now defined by an instruction
//      of Stage (uses are redirected to that definition).
//   3. Clone each SourceBB PHI into DestBB with a fresh result register.
//   4. Rewrite register uses in DestBB's non-PHI instructions according to
//      the remaps recorded in passes 1 and 3.
//
// DestBB   - block receiving the instructions.
// SourceBB - block the instructions are taken from.
// Stage    - stage number selecting which instructions are moved.
void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
1615+
MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
1616+
// Moved instructions are inserted before DestBB's first non-PHI instruction
// as it was on entry.
auto InsertPt = DestBB->getFirstNonPHI();
1617+
// Maps a SourceBB PHI result register to the register that DestBB
// instructions must use instead.
DenseMap<Register, Register> Remaps;
1618+
for (auto I = SourceBB->getFirstNonPHI(); I != SourceBB->end();) {
1619+
// Advance first: MI may be unlinked from SourceBB below.
MachineInstr *MI = &*I++;
1620+
if (MI->isPHI()) {
1621+
// This is an illegal PHI. If we move any instructions using an illegal
1622+
// PHI, we need to create a legal PHI.
1623+
Register PhiR = MI->getOperand(0).getReg();
1624+
auto RC = MRI.getRegClass(PhiR);
1625+
Register NR = MRI.createVirtualRegister(RC);
1626+
// The new PHI lives in DestBB and takes PhiR as its value coming from
// SourceBB.
MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(), DebugLoc(),
1627+
TII->get(TargetOpcode::PHI), NR)
1628+
.addReg(PhiR)
1629+
.addMBB(SourceBB);
1630+
// Register the new PHI under the same canonical instruction as MI.
BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
1631+
CanonicalMIs[NI] = CanonicalMIs[MI];
1632+
Remaps[PhiR] = NR;
1633+
continue;
1634+
}
1635+
if (getStage(MI) != Stage)
1636+
continue;
1637+
MI->removeFromParent();
1638+
DestBB->insert(InsertPt, MI);
1639+
// MI now belongs to DestBB: move its BlockMIs entry accordingly.
auto *KernelMI = CanonicalMIs[MI];
1640+
BlockMIs[{DestBB, KernelMI}] = MI;
1641+
BlockMIs.erase({SourceBB, KernelMI});
1642+
}
1643+
// PHIs are only queued here and erased after the loop, so the phis() range
// is not modified while it is being iterated.
SmallVector<MachineInstr *, 4> PhiToDelete;
1644+
for (MachineInstr &MI : DestBB->phis()) {
1645+
// Each DestBB PHI is expected to have exactly one incoming (value, block)
// pair: operand 0 is the result, operand 1 the incoming value.
assert(MI.getNumOperands() == 3);
1646+
MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
1647+
// If the instruction referenced by the phi is moved inside the block
1648+
// we don't need the phi anymore.
1649+
if (getStage(Def) == Stage) {
1650+
Register PhiReg = MI.getOperand(0).getReg();
1651+
MRI.replaceRegWith(MI.getOperand(0).getReg(),
1652+
Def->getOperand(0).getReg());
1653+
// Put the original result register back on the PHI itself before it is
// queued for deletion; presumably replaceRegWith rewrote the PHI's own
// def operand as well -- confirm.
MI.getOperand(0).setReg(PhiReg);
1654+
PhiToDelete.push_back(&MI);
1655+
}
1656+
}
1657+
for (auto *P : PhiToDelete)
1658+
P->eraseFromParent();
1659+
// Recompute the insertion point now that DestBB's PHI list has changed.
InsertPt = DestBB->getFirstNonPHI();
1660+
for (MachineInstr &MI : SourceBB->phis()) {
1661+
MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
1662+
DestBB->insert(InsertPt, NewMI);
1663+
Register OrigR = MI.getOperand(0).getReg();
1664+
Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
1665+
// The clone defines a fresh register, takes the original PHI's result as
// its incoming value, and names DestBB's first predecessor as the
// incoming block.
NewMI->getOperand(0).setReg(R);
1666+
NewMI->getOperand(1).setReg(OrigR);
1667+
NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
1668+
Remaps[OrigR] = R;
1669+
CanonicalMIs[NewMI] = CanonicalMIs[&MI];
1670+
BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NewMI;
1671+
}
1672+
// Apply the recorded remaps to every register use in DestBB's non-PHI
// instructions.
for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I)
1673+
for (MachineOperand &MO : I->uses())
1674+
if (MO.isReg() && Remaps.count(MO.getReg()))
1675+
MO.setReg(Remaps[MO.getReg()]);
1676+
}
1677+
15851678
void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
15861679
BitVector LS(Schedule.getNumStages(), true);
15871680
BitVector AS(Schedule.getNumStages(), true);
@@ -1607,23 +1700,36 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
16071700

16081701
// Push out the epilogs, again in reverse order.
16091702
// We can't assume anything about the minimum loop trip count at this point,
1610-
// so emit a fairly complex epilog:
1611-
// K[0, 1, 2] // Kernel runs stages 0, 1, 2
1612-
// E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0].
1613-
// E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2.
1614-
//
1615-
// This creates a single-successor single-predecessor sequence of blocks for
1616-
// each epilog, which are kept this way for simplicity at this stage and
1617-
// cleaned up by the optimizer later.
1703+
// so emit a fairly complex epilog.
1704+
1705+
// We first peel (number of stages - 1) epilogues. Then we remove dead
1706+
// stages and reorder instructions based on their stage. If we have 3 stages
1707+
// we generate first:
1708+
// E0[3, 2, 1]
1709+
// E1[3', 2']
1710+
// E2[3'']
1711+
// And then we move instructions based on their stages to have:
1712+
// E0[3]
1713+
// E1[2, 3']
1714+
// E2[1, 2', 3'']
1715+
// The transformation is legal because we only move instructions past
1716+
// instructions of a previous loop iteration.
16181717
for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
1619-
Epilogs.push_back(nullptr);
1620-
for (int J = Schedule.getNumStages() - 1; J >= I; --J) {
1621-
LS.reset();
1622-
LS[J] = 1;
1623-
Epilogs.back() = peelKernel(LPD_Back);
1624-
LiveStages[Epilogs.back()] = LS;
1625-
AvailableStages[Epilogs.back()] = AS;
1718+
Epilogs.push_back(peelKernel(LPD_Back));
1719+
filterInstructions(Epilogs.back(), Schedule.getNumStages() - I);
1720+
}
1721+
for (size_t I = 0; I < Epilogs.size(); I++) {
1722+
LS.reset();
1723+
for (size_t J = I; J < Epilogs.size(); J++) {
1724+
int Iteration = J;
1725+
unsigned Stage = Schedule.getNumStages() - 1 + I - J;
1726+
// Move stage one block at a time so that Phi nodes are updated correctly.
1727+
for (size_t K = Iteration; K > I; K--)
1728+
moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
1729+
LS[Stage] = 1;
16261730
}
1731+
LiveStages[Epilogs[I]] = LS;
1732+
AvailableStages[Epilogs[I]] = AS;
16271733
}
16281734

16291735
// Now we've defined all the prolog and epilog blocks as a fallthrough
@@ -1659,6 +1765,13 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
16591765
rewriteUsesOf(MI);
16601766
}
16611767
}
1768+
for (auto *MI : IllegalPhisToDelete) {
1769+
if (LIS)
1770+
LIS->RemoveMachineInstrFromMaps(*MI);
1771+
MI->eraseFromParent();
1772+
}
1773+
IllegalPhisToDelete.clear();
1774+
16621775
// Now all remapping has been done, we're free to optimize the generated code.
16631776
for (MachineBasicBlock *B : reverse(Blocks))
16641777
EliminateDeadPhis(B, MRI, LIS);
@@ -1727,9 +1840,10 @@ void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
17271840
R = MI->getOperand(1).getReg();
17281841
MRI.setRegClass(R, MRI.getRegClass(PhiR));
17291842
MRI.replaceRegWith(PhiR, R);
1730-
if (LIS)
1731-
LIS->RemoveMachineInstrFromMaps(*MI);
1732-
MI->eraseFromParent();
1843+
// Postpone deleting the Phi as it may be referenced by BlockMIs and used
1844+
// later to figure out how to remap registers.
1845+
MI->getOperand(0).setReg(PhiR);
1846+
IllegalPhisToDelete.push_back(MI);
17331847
return;
17341848
}
17351849

llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
2-
; XFAIL: *
3-
; LSR changes required.
42

53
; This version of the conv3x3 test has both loops. This test checks that the
64
; inner loop has 13 packets.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true < %s | FileCheck %s
2+
3+
; Test epilogue generation when reading loop-carried dependency from a previous
4+
; stage. The first epilogue should read value from iteration N-1 of the kernel.
5+
6+
; CHECK: loop0
7+
; CHECK: r{{[0-9]+}} = add([[REG0:r([0-9]+)]],#8)
8+
; CHECK: [[REG0:r([0-9]+)]] = [[REG1:r([0-9]+)]]
9+
; CHECK: endloop0
10+
; CHECK: = add([[REG1]],#8)
11+
12+
; Function Attrs: nounwind
13+
define i32* @f0(i16* nocapture readonly %a0, i32 %a1) #0 {
14+
b0:
15+
%v0 = alloca [129 x i32], align 8
16+
br i1 undef, label %b1, label %b3
17+
18+
b1: ; preds = %b0
19+
br label %b2
20+
21+
b2: ; preds = %b2, %b1
22+
%v1 = phi i16* [ %a0, %b1 ], [ %v2, %b2 ]
23+
%v2 = phi i16* [ undef, %b1 ], [ %v15, %b2 ]
24+
%v3 = phi i32* [ null, %b1 ], [ %v4, %b2 ]
25+
%v4 = phi i32* [ null, %b1 ], [ %v14, %b2 ]
26+
%v5 = phi i32 [ 0, %b1 ], [ %v13, %b2 ]
27+
%v6 = phi i16* [ undef, %b1 ], [ %v12, %b2 ]
28+
%v7 = load i16, i16* %v2, align 2
29+
%v8 = sext i16 %v7 to i32
30+
%v9 = call i32 @llvm.hexagon.M2.mpy.ll.s0(i32 %v8, i32 %v8) #2
31+
%v10 = load i16, i16* %v6, align 2
32+
%v11 = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32 %v9, i32 undef, i32 undef) #2
33+
store i32 %v11, i32* %v4, align 4
34+
%v12 = getelementptr inbounds i16, i16* %v6, i32 -1
35+
%v13 = add i32 %v5, 1
36+
%v14 = getelementptr inbounds i32, i32* %v3, i32 2
37+
%v15 = getelementptr inbounds i16, i16* %v1, i32 2
38+
%v16 = icmp slt i32 %v13, %a1
39+
br i1 %v16, label %b2, label %b3
40+
41+
b3: ; preds = %b2, %b0
42+
%out = phi i32* [ null, %b0 ], [ %v14, %b2 ]
43+
ret i32* %out
44+
}
45+
46+
; Function Attrs: nounwind readnone
47+
declare i32 @llvm.hexagon.M2.mpy.ll.s0(i32, i32) #1
48+
49+
; Function Attrs: nounwind readnone
50+
declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32, i32, i32) #1
51+
52+
attributes #0 = { nounwind "target-cpu"="hexagonv60" }
53+
attributes #1 = { nounwind readnone }
54+
attributes #2 = { nounwind }

llvm/test/CodeGen/Hexagon/swp-stages4.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
; CHECK: = and
77
; CHECK: = and
8-
; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1)
98
; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255)
109
; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255)
1110
; CHECK: loop0(.LBB0_[[LOOP:.]],

0 commit comments

Comments
 (0)