@@ -1582,6 +1582,99 @@ PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
1582
1582
return NewBB;
1583
1583
}
1584
1584
1585
// Erases from MB the instructions whose stage is below MinStage.
//
// The block is walked in reverse, starting at its first terminator
// instruction and ending at its first non-PHI instruction. An instruction is
// kept when getStage() returns -1 for it (presumably "no stage recorded" --
// confirm against getStage) or when its stage is >= MinStage. Every other
// instruction is erased, after the instructions that read its results have
// been redirected to an equivalent register.
//
// \param MB        Block to filter.
// \param MinStage  Lowest stage that is kept in MB.
//
// NOTE(review): the iterators returned by getFirstInstrTerminator() and
// getFirstNonPHI() are dereferenced unconditionally, so this appears to
// assume MB has a terminator and at least one non-PHI instruction -- confirm
// with the callers.
+ void PeelingModuloScheduleExpander::filterInstructions (MachineBasicBlock *MB,
1586
+ int MinStage) {
1587
+ for (auto I = MB->getFirstInstrTerminator ()->getReverseIterator ();
1588
+ I != std::next (MB->getFirstNonPHI ()->getReverseIterator ());) {
// Step the iterator before using MI: MI may be erased at the end of this
// iteration.
1589
+ MachineInstr *MI = &*I++;
1590
+ int Stage = getStage (MI);
1591
+ if (Stage == -1 || Stage >= MinStage)
1592
+ continue ;
1593
+
1594
+ for (MachineOperand &DefMO : MI->defs ()) {
// Gather the (user, replacement register) pairs first and apply them in a
// second pass (presumably so the use list is not modified while it is being
// walked -- confirm).
1595
+ SmallVector<std::pair<MachineInstr *, Register>, 4 > Subs;
1596
+ for (MachineInstr &UseMI : MRI.use_instructions (DefMO.getReg ())) {
1597
+ // Only PHIs can use values from this block by construction.
1598
+ // Match with the equivalent PHI in B.
1599
+ assert (UseMI.isPHI ());
// The replacement is looked up from the PHI's own result register, in the
// block that contains MI.
1600
+ Register Reg = getEquivalentRegisterIn (UseMI.getOperand (0 ).getReg (),
1601
+ MI->getParent ());
1602
+ Subs.emplace_back (&UseMI, Reg);
1603
+ }
1604
+ for (auto &Sub : Subs)
1605
+ Sub.first ->substituteRegister (DefMO.getReg (), Sub.second , /* SubIdx=*/ 0 ,
1606
+ *MRI.getTargetRegisterInfo ());
1607
+ }
// LIS may be null; it is only updated when present.
1608
+ if (LIS)
1609
+ LIS->RemoveMachineInstrFromMaps (*MI);
1610
+ MI->eraseFromParent ();
1611
+ }
1612
+ }
1613
+
1614
// Moves every instruction of stage `Stage` from SourceBB into DestBB and
// repairs the PHIs and register uses affected by the move.
//
// The work is done in five passes:
//   1. Walk SourceBB from its first non-PHI instruction to the end. A PHI
//      found there gets a new PHI in DestBB that reads it; an instruction of
//      the requested stage is moved into DestBB.
//   2. For each PHI of DestBB whose incoming value is defined by an
//      instruction of the requested stage, replace the PHI's result with that
//      value and queue the PHI for deletion.
//   3. Erase the queued PHIs.
//   4. Clone each PHI of SourceBB into DestBB with a fresh result register.
//   5. Rewrite the uses in DestBB's non-PHI instructions through the
//      old-register -> new-register map built in passes 1 and 4.
//
// \param DestBB    Block that receives the instructions.
// \param SourceBB  Block the instructions are taken from.
// \param Stage     Stage whose instructions are moved.
+ void PeelingModuloScheduleExpander::moveStageBetweenBlocks (
1615
+ MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
// Insertion point for moved instructions, captured once on entry.
1616
+ auto InsertPt = DestBB->getFirstNonPHI ();
// Maps a register defined by a SourceBB PHI to the register of the PHI
// created for it in DestBB.
1617
+ DenseMap<Register, Register> Remaps;
1618
+ for (auto I = SourceBB->getFirstNonPHI (); I != SourceBB->end ();) {
// Step the iterator first: MI may be unlinked from SourceBB below.
1619
+ MachineInstr *MI = &*I++;
1620
+ if (MI->isPHI ()) {
1621
+ // This is an illegal PHI. If we move any instructions using an illegal
1622
+ // PHI, we need to create a legal Phi
1623
+ Register PhiR = MI->getOperand (0 ).getReg ();
1624
+ auto RC = MRI.getRegClass (PhiR);
1625
+ Register NR = MRI.createVirtualRegister (RC);
// The new PHI lives in DestBB and has PhiR coming from SourceBB as its only
// incoming value.
1626
+ MachineInstr *NI = BuildMI (*DestBB, DestBB->getFirstNonPHI (), DebugLoc (),
1627
+ TII->get (TargetOpcode::PHI), NR)
1628
+ .addReg (PhiR)
1629
+ .addMBB (SourceBB);
1630
+ BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
1631
+ CanonicalMIs[NI] = CanonicalMIs[MI];
1632
+ Remaps[PhiR] = NR;
1633
+ continue ;
1634
+ }
// NOTE(review): getStage() is stored into an int elsewhere in this file while
// Stage is unsigned, so this comparison mixes signedness -- confirm that is
// intended.
1635
+ if (getStage (MI) != Stage)
1636
+ continue ;
1637
+ MI->removeFromParent ();
1638
+ DestBB->insert (InsertPt, MI);
// Re-key the bookkeeping entry for this instruction from SourceBB to DestBB.
1639
+ auto *KernelMI = CanonicalMIs[MI];
1640
+ BlockMIs[{DestBB, KernelMI}] = MI;
1641
+ BlockMIs.erase ({SourceBB, KernelMI});
1642
+ }
// Deletion is deferred so DestBB's PHI list is not modified while it is being
// iterated.
1643
+ SmallVector<MachineInstr *, 4 > PhiToDelete;
1644
+ for (MachineInstr &MI : DestBB->phis ()) {
// Each PHI is expected to have exactly one incoming (register, block) pair
// besides its result operand.
1645
+ assert (MI.getNumOperands () == 3 );
1646
+ MachineInstr *Def = MRI.getVRegDef (MI.getOperand (1 ).getReg ());
1647
+ // If the instruction referenced by the phi is moved inside the block
1648
+ // we don't need the phi anymore.
1649
+ if (getStage (Def) == Stage) {
1650
+ Register PhiReg = MI.getOperand (0 ).getReg ();
1651
+ MRI.replaceRegWith (MI.getOperand (0 ).getReg (),
1652
+ Def->getOperand (0 ).getReg ());
// Put the PHI's original result register back on its def operand
// (presumably because replaceRegWith rewrote that operand too -- confirm).
1653
+ MI.getOperand (0 ).setReg (PhiReg);
1654
+ PhiToDelete.push_back (&MI);
1655
+ }
1656
+ }
1657
+ for (auto *P : PhiToDelete)
1658
+ P->eraseFromParent ();
// Recompute the insertion point now that DestBB's PHIs have changed.
1659
+ InsertPt = DestBB->getFirstNonPHI ();
1660
+ for (MachineInstr &MI : SourceBB->phis ()) {
1661
+ MachineInstr *NewMI = MF.CloneMachineInstr (&MI);
1662
+ DestBB->insert (InsertPt, NewMI);
1663
+ Register OrigR = MI.getOperand (0 ).getReg ();
1664
+ Register R = MRI.createVirtualRegister (MRI.getRegClass (OrigR));
// The clone defines a fresh register and reads the original PHI's result,
// with the first predecessor of DestBB recorded as the incoming block.
1665
+ NewMI->getOperand (0 ).setReg (R);
1666
+ NewMI->getOperand (1 ).setReg (OrigR);
1667
+ NewMI->getOperand (2 ).setMBB (*DestBB->pred_begin ());
1668
+ Remaps[OrigR] = R;
1669
+ CanonicalMIs[NewMI] = CanonicalMIs[&MI];
1670
+ BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NewMI;
1671
+ }
// Point every remapped register use in DestBB's non-PHI instructions at the
// register recorded for it in Remaps.
1672
+ for (auto I = DestBB->getFirstNonPHI (); I != DestBB->end (); ++I)
1673
+ for (MachineOperand &MO : I->uses ())
1674
+ if (MO.isReg () && Remaps.count (MO.getReg ()))
1675
+ MO.setReg (Remaps[MO.getReg ()]);
1676
+ }
1677
+
1585
1678
void PeelingModuloScheduleExpander::peelPrologAndEpilogs () {
1586
1679
BitVector LS (Schedule.getNumStages (), true );
1587
1680
BitVector AS (Schedule.getNumStages (), true );
@@ -1607,23 +1700,36 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
1607
1700
1608
1701
// Push out the epilogs, again in reverse order.
1609
1702
// We can't assume anything about the minimum loop trip count at this point,
1610
- // so emit a fairly complex epilog:
1611
- // K[0, 1, 2] // Kernel runs stages 0, 1, 2
1612
- // E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0].
1613
- // E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2.
1614
- //
1615
- // This creates a single-successor single-predecessor sequence of blocks for
1616
- // each epilog, which are kept this way for simplicity at this stage and
1617
- // cleaned up by the optimizer later.
1703
+ // so emit a fairly complex epilog.
1704
+
1705
+ // We first peel (number of stages - 1) epilogues. Then we remove dead
1706
+ // stages and reorder instructions based on their stage. If we have 3 stages
1707
+ // we generate first:
1708
+ // E0[3, 2, 1]
1709
+ // E1[3', 2']
1710
+ // E2[3'']
1711
+ // And then we move instructions based on their stages to have:
1712
+ // E0[3]
1713
+ // E1[2, 3']
1714
+ // E2[1, 2', 3'']
1715
+ // The transformation is legal because we only move instructions past
1716
+ // instructions of a previous loop iteration.
1618
1717
for (int I = 1 ; I <= Schedule.getNumStages () - 1 ; ++I) {
1619
- Epilogs.push_back (nullptr );
1620
- for (int J = Schedule.getNumStages () - 1 ; J >= I; --J) {
1621
- LS.reset ();
1622
- LS[J] = 1 ;
1623
- Epilogs.back () = peelKernel (LPD_Back);
1624
- LiveStages[Epilogs.back ()] = LS;
1625
- AvailableStages[Epilogs.back ()] = AS;
1718
+ Epilogs.push_back (peelKernel (LPD_Back));
1719
+ filterInstructions (Epilogs.back (), Schedule.getNumStages () - I);
1720
+ }
1721
+ for (size_t I = 0 ; I < Epilogs.size (); I++) {
1722
+ LS.reset ();
1723
+ for (size_t J = I; J < Epilogs.size (); J++) {
1724
+ int Iteration = J;
1725
+ unsigned Stage = Schedule.getNumStages () - 1 + I - J;
1726
+ // Move stage one block at a time so that Phi nodes are updated correctly.
1727
+ for (size_t K = Iteration; K > I; K--)
1728
+ moveStageBetweenBlocks (Epilogs[K - 1 ], Epilogs[K], Stage);
1729
+ LS[Stage] = 1 ;
1626
1730
}
1731
+ LiveStages[Epilogs[I]] = LS;
1732
+ AvailableStages[Epilogs[I]] = AS;
1627
1733
}
1628
1734
1629
1735
// Now we've defined all the prolog and epilog blocks as a fallthrough
@@ -1659,6 +1765,13 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
1659
1765
rewriteUsesOf (MI);
1660
1766
}
1661
1767
}
1768
+ for (auto *MI : IllegalPhisToDelete) {
1769
+ if (LIS)
1770
+ LIS->RemoveMachineInstrFromMaps (*MI);
1771
+ MI->eraseFromParent ();
1772
+ }
1773
+ IllegalPhisToDelete.clear ();
1774
+
1662
1775
// Now all remapping has been done, we're free to optimize the generated code.
1663
1776
for (MachineBasicBlock *B : reverse (Blocks))
1664
1777
EliminateDeadPhis (B, MRI, LIS);
@@ -1727,9 +1840,10 @@ void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
1727
1840
R = MI->getOperand (1 ).getReg ();
1728
1841
MRI.setRegClass (R, MRI.getRegClass (PhiR));
1729
1842
MRI.replaceRegWith (PhiR, R);
1730
- if (LIS)
1731
- LIS->RemoveMachineInstrFromMaps (*MI);
1732
- MI->eraseFromParent ();
1843
+ // Postpone deleting the Phi as it may be referenced by BlockMIs and used
1844
+ // later to figure out how to remap registers.
1845
+ MI->getOperand (0 ).setReg (PhiR);
1846
+ IllegalPhisToDelete.push_back (MI);
1733
1847
return ;
1734
1848
}
1735
1849
0 commit comments