[ARM] Small reorganisation in ARMParallelDSP

sparker-arm · sparker-arm · commit 0a1353a3c8c9 · 2018-11-09T09:18:00.000Z
A few code movement things: - AreSymmetrical is now a method of BinOpChain. - Created a lambda in CreateParallelMACPairs to reduce loop nesting. - A Reduction object now gets pasted in a couple of places instead, including CreateParallelMACPairs so it doesn't need to return a value. I've also added RecordSequentialLoads, which is run before the transformation begins, and caches the interesting loads. This can then be queried later instead of cross checking many load values. Differential Revision: https://reviews.llvm.org/D54254 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346479 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -99,16 +99,18 @@ namespace {
         for (auto *V : RHS)
           AllValues.push_back(V);
       }
+
+    bool AreSymmetrical(BinOpChain *Other);
   };
 
   struct Reduction {
     PHINode         *Phi;             // The Phi-node from where we start
                                       // pattern matching.
     Instruction     *AccIntAdd;       // The accumulating integer add statement,
                                       // i.e, the reduction statement.
-
     OpChainList     MACCandidates;    // The MAC candidates associated with
                                       // this reduction statement.
+    PMACPairList    PMACPairs;
     Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
   };
 
@@ -121,10 +123,13 @@ namespace {
     Loop              *L;
     const DataLayout  *DL;
     Module            *M;
+    std::map<LoadInst*, LoadInst*> LoadPairs;
+    std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
 
-    bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
+    bool RecordSequentialLoads(BasicBlock *Header);
+    bool InsertParallelMACs(Reduction &Reduction);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
-    PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
+    void CreateParallelMACPairs(Reduction &R);
     Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                  Instruction *Acc, bool Exchange,
                                  Instruction *InsertAfter);
@@ -202,6 +207,12 @@ namespace {
 
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
+
+      if (!RecordSequentialLoads(Header)) {
+        LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
+        return false;
+      }
+
       Changes = MatchSMLAD(F);
       return Changes;
     }
@@ -254,58 +265,14 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) {
   return false;
 }
 
-// Element-by-element comparison of Value lists returning true if they are
-// instructions with the same opcode or constants with the same value.
-static bool AreSymmetrical(const ValueList &VL0,
-                           const ValueList &VL1) {
-  if (VL0.size() != VL1.size()) {
-    LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
-                      << VL0.size() << " != " << VL1.size() << "\n");
-    return false;
-  }
-
-  const unsigned Pairs = VL0.size();
-  LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
-
-  for (unsigned i = 0; i < Pairs; ++i) {
-    const Value *V0 = VL0[i];
-    const Value *V1 = VL1[i];
-    const auto *Inst0 = dyn_cast<Instruction>(V0);
-    const auto *Inst1 = dyn_cast<Instruction>(V1);
-
-    LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
-               dbgs() << "mul1: "; V0->dump();
-               dbgs() << "mul2: "; V1->dump());
-
-    if (!Inst0 || !Inst1)
-      return false;
-
-    if (Inst0->isSameOperationAs(Inst1)) {
-      LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
-      continue;
-    }
-
-    const APInt *C0, *C1;
-    if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
-      return false;
-  }
-
-  LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
-  return true;
-}
-
 template<typename MemInst>
 static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
-                                  MemInstList &VecMem, const DataLayout &DL,
-                                  ScalarEvolution &SE) {
+                                  const DataLayout &DL, ScalarEvolution &SE) {
   if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
     LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
     return false;
   }
   if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
-    VecMem.clear();
-    VecMem.push_back(MemOp0);
-    VecMem.push_back(MemOp1);
     LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
     return true;
   }
@@ -328,16 +295,106 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
     return false;
   }
 
-  return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
+  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
+    return false;
+
+  VecMem.clear();
+  VecMem.push_back(Ld0);
+  VecMem.push_back(Ld1);
+  return true;
+}
+
+/// Iterate through the block and record base, offset pairs of loads as well as
+/// maximal sequences of sequential loads.
+bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
+  SmallVector<LoadInst*, 8> Loads;
+  for (auto &I : *Header) {
+    auto *Ld = dyn_cast<LoadInst>(&I);
+    if (!Ld)
+      continue;
+    Loads.push_back(Ld);
+  }
+
+  std::map<LoadInst*, LoadInst*> BaseLoads;
+
+  for (auto *Ld0 : Loads) {
+    for (auto *Ld1 : Loads) {
+      if (Ld0 == Ld1)
+        continue;
+
+      if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
+        LoadPairs[Ld0] = Ld1;
+        if (BaseLoads.count(Ld0)) {
+          LoadInst *Base = BaseLoads[Ld0];
+          BaseLoads[Ld1] = Base;
+          SequentialLoads[Base].push_back(Ld1);
+        } else {
+          BaseLoads[Ld1] = Ld0;
+          SequentialLoads[Ld0].push_back(Ld1);
+        }
+      }
+    }
+  }
+  return LoadPairs.size() > 1;
 }
 
-PMACPairList
-ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
+void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
+  OpChainList &Candidates = R.MACCandidates;
+  PMACPairList &PMACPairs = R.PMACPairs;
   const unsigned Elems = Candidates.size();
-  PMACPairList PMACPairs;
 
   if (Elems < 2)
-    return PMACPairs;
+    return;
+
+  auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) {
+    if (!PMul0->AreSymmetrical(PMul1))
+      return false;
+
+    // The first elements of each vector should be loads with sexts. If we
+    // find that its two pairs of consecutive loads, then these can be
+    // transformed into two wider loads and the users can be replaced with
+    // DSP intrinsics.
+    for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) {
+      auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]);
+      auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]);
+      auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]);
+      auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]);
+
+      if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
+        return false;
+
+      LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
+                 << "\t Ld0: " << *Ld0 << "\n"
+                 << "\t Ld1: " << *Ld1 << "\n"
+                 << "and operands " << x + 2 << ":\n"
+                 << "\t Ld2: " << *Ld2 << "\n"
+                 << "\t Ld3: " << *Ld3 << "\n");
+
+      if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
+        if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+          LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+          PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+          return true;
+        } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
+          LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+          LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
+          PMul1->Exchange = true;
+          PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+          return true;
+        }
+      } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
+                 AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+        LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
+        LLVM_DEBUG(dbgs() << "    and swapping muls\n");
+        PMul0->Exchange = true;
+        // Only the second operand can be exchanged, so swap the muls.
+        PMACPairs.push_back(std::make_pair(PMul1, PMul0));
+        return true;
+      }
+    }
+    return false;
+  };
 
   SmallPtrSet<const Instruction*, 4> Paired;
   for (unsigned i = 0; i < Elems; ++i) {
@@ -364,77 +421,21 @@ ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
                  dbgs() << "- "; Mul0->dump();
                  dbgs() << "- "; Mul1->dump());
 
-      const ValueList &Mul0_LHS = PMul0->LHS;
-      const ValueList &Mul0_RHS = PMul0->RHS;
-      const ValueList &Mul1_LHS = PMul1->LHS;
-      const ValueList &Mul1_RHS = PMul1->RHS;
-
-      if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
-          !AreSymmetrical(Mul0_RHS, Mul1_RHS))
-        continue;
-
       LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
-      // The first elements of each vector should be loads with sexts. If we
-      // find that its two pairs of consecutive loads, then these can be
-      // transformed into two wider loads and the users can be replaced with
-      // DSP intrinsics.
-      bool Found = false;
-      for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
-        auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
-        auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
-        auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
-        auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
-
-        if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
-          continue;
-
-        LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
-                   << "\t Ld0: " << *Ld0 << "\n"
-                   << "\t Ld1: " << *Ld1 << "\n"
-                   << "and operands " << x + 2 << ":\n"
-                   << "\t Ld2: " << *Ld2 << "\n"
-                   << "\t Ld3: " << *Ld3 << "\n");
-
-        if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
-          if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
-            LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
-            PMACPairs.push_back(std::make_pair(PMul0, PMul1));
-            Found = true;
-          } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
-            LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
-            LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
-            PMul1->Exchange = true;
-            PMACPairs.push_back(std::make_pair(PMul0, PMul1));
-            Found = true;
-          }
-        } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd)) {
-          if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
-            LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
-            LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
-            LLVM_DEBUG(dbgs() << "    and swapping muls\n");
-            PMul0->Exchange = true;
-            // Only the second operand can be exchanged, so swap the muls.
-            PMACPairs.push_back(std::make_pair(PMul1, PMul0));
-            Found = true;
-          }
-        }
-      }
-      if (Found) {
+      if (CanPair(PMul0, PMul1)) {
         Paired.insert(Mul0);
         Paired.insert(Mul1);
         break;
       }
     }
   }
-  return PMACPairs;
 }
 
-bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
-                                        PMACPairList &PMACPairs) {
+bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
   Instruction *Acc = Reduction.Phi;
   Instruction *InsertAfter = Reduction.AccIntAdd;
 
-  for (auto &Pair : PMACPairs) {
+  for (auto &Pair : Reduction.PMACPairs) {
     BinOpChain *PMul0 = Pair.first;
     BinOpChain *PMul1 = Pair.second;
     LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
@@ -685,8 +686,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
   for (auto &R : Reductions) {
     if (AreAliased(AA, Reads, Writes, R.MACCandidates))
       return false;
-    PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
-    Changed |= InsertParallelMACs(R, PMACPairs);
+    CreateParallelMACPairs(R);
+    Changed |= InsertParallelMACs(R);
   }
 
   LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
@@ -733,6 +734,52 @@ Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
   return Call;
 }
 
+// Compare the value lists in Other to this chain.
+bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
+  // Element-by-element comparison of Value lists returning true if they are
+  // instructions with the same opcode or constants with the same value.
+  auto CompareValueList = [](const ValueList &VL0,
+                             const ValueList &VL1) {
+    if (VL0.size() != VL1.size()) {
+      LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
+                        << VL0.size() << " != " << VL1.size() << "\n");
+      return false;
+    }
+
+    const unsigned Pairs = VL0.size();
+    LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
+
+    for (unsigned i = 0; i < Pairs; ++i) {
+      const Value *V0 = VL0[i];
+      const Value *V1 = VL1[i];
+      const auto *Inst0 = dyn_cast<Instruction>(V0);
+      const auto *Inst1 = dyn_cast<Instruction>(V1);
+
+      LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
+                dbgs() << "mul1: "; V0->dump();
+                dbgs() << "mul2: "; V1->dump());
+
+      if (!Inst0 || !Inst1)
+        return false;
+
+      if (Inst0->isSameOperationAs(Inst1)) {
+        LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+        continue;
+      }
+
+      const APInt *C0, *C1;
+      if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
+        return false;
+    }
+
+    LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
+    return true;
+  };
+
+  return CompareValueList(LHS, Other->LHS) &&
+         CompareValueList(RHS, Other->RHS);
+}
+
 Pass *llvm::createARMParallelDSPPass() {
   return new ARMParallelDSP();
 }