[LoopInterchange] Stop performing unprofitable interchange

kasuga-fj · kasuga-fj · commit 45a837364233 · 2025-02-18T07:39:54.000Z
LoopInterchange uses a bubble-sort-like algorithm to sort the loops, but
the comparison function (called isProfitable) didn't satisfy asymmetry.
This means that both isProfitable(a, b) and isProfitable(b, a) could
return true, triggering an unprofitable interchange. This patch makes
isProfitable asymmetric, which prevents the pass from performing such
unprofitable transformations.
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -356,26 +356,25 @@ class LoopInterchangeLegality {
   SmallVector<PHINode *, 8> InnerLoopInductions;
 };
 
+using CostMapTy = DenseMap<const Loop *, std::pair<unsigned, CacheCostTy>>;
+
 /// LoopInterchangeProfitability checks if it is profitable to interchange the
 /// loop.
 class LoopInterchangeProfitability {
 public:
   LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
-                               OptimizationRemarkEmitter *ORE)
-      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+                               OptimizationRemarkEmitter *ORE,
+                               const std::optional<CostMapTy> &CM)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE), CostMap(CM) {}
 
   /// Check if the loop interchange is profitable.
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
-                    CharMatrix &DepMatrix,
-                    const DenseMap<const Loop *, unsigned> &CostMap,
-                    std::unique_ptr<CacheCost> &CC);
+                    CharMatrix &DepMatrix);
 
 private:
   int getInstrOrderCost();
-  std::optional<bool> isProfitablePerLoopCacheAnalysis(
-      const DenseMap<const Loop *, unsigned> &CostMap,
-      std::unique_ptr<CacheCost> &CC);
+  std::optional<bool> isProfitablePerLoopCacheAnalysis();
   std::optional<bool> isProfitablePerInstrOrderCost();
   std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
                                                    unsigned OuterLoopId,
@@ -388,6 +387,8 @@ class LoopInterchangeProfitability {
 
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
+
+  const std::optional<CostMapTy> &CostMap;
 };
 
 /// LoopInterchangeTransform interchanges the loop.
@@ -497,11 +498,13 @@ struct LoopInterchange {
     // indicates the loop should be placed as the innermost loop.
     //
     // For the old pass manager CacheCost would be null.
-    DenseMap<const Loop *, unsigned> CostMap;
+    std::optional<CostMapTy> CostMap = std::nullopt;
     if (CC != nullptr) {
+      CostMap = CostMapTy();
       const auto &LoopCosts = CC->getLoopCosts();
       for (unsigned i = 0; i < LoopCosts.size(); i++) {
-        CostMap[LoopCosts[i].first] = i;
+        const auto &Cost = LoopCosts[i];
+        (*CostMap)[Cost.first] = std::make_pair(i, Cost.second);
       }
     }
     // We try to achieve the globally optimal memory access for the loopnest,
@@ -537,7 +540,7 @@ struct LoopInterchange {
   bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
                    unsigned OuterLoopId,
                    std::vector<std::vector<char>> &DependencyMatrix,
-                   const DenseMap<const Loop *, unsigned> &CostMap) {
+                   const std::optional<CostMapTy> &CostMap) {
     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
                       << " and OuterLoopId = " << OuterLoopId << "\n");
     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
@@ -546,9 +549,9 @@ struct LoopInterchange {
       return false;
     }
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
-    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
+    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE, CostMap);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, CostMap, CC)) {
+                          DependencyMatrix)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1127,29 +1130,60 @@ int LoopInterchangeProfitability::getInstrOrderCost() {
 }
 
 std::optional<bool>
-LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
-    const DenseMap<const Loop *, unsigned> &CostMap,
-    std::unique_ptr<CacheCost> &CC) {
+LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis() {
   // This is the new cost model returned from loop cache analysis.
   // A smaller index means the loop should be placed an outer loop, and vice
   // versa.
-  if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) {
-    unsigned InnerIndex = 0, OuterIndex = 0;
-    InnerIndex = CostMap.find(InnerLoop)->second;
-    OuterIndex = CostMap.find(OuterLoop)->second;
-    LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
-                      << ", OuterIndex = " << OuterIndex << "\n");
-    if (InnerIndex < OuterIndex)
-      return std::optional<bool>(true);
-    assert(InnerIndex != OuterIndex && "CostMap should assign unique "
-                                       "numbers to each loop");
-    if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
-      return std::nullopt;
-    return std::optional<bool>(false);
-  }
-  return std::nullopt;
+  if (!CostMap.has_value())
+    return std::nullopt;
+
+  auto InnerIte = CostMap->find(InnerLoop);
+  auto OuterIte = CostMap->find(OuterLoop);
+  if (InnerIte == CostMap->end() || OuterIte == CostMap->end())
+    return std::nullopt;
+
+  const auto &[InnerIndex, InnerCost] = InnerIte->second;
+  const auto &[OuterIndex, OuterCost] = OuterIte->second;
+  LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
+                    << ", OuterIndex = " << OuterIndex << "\n");
+  assert(InnerIndex != OuterIndex && "CostMap should assign unique "
+                                     "numbers to each loop");
+
+  if (InnerCost == OuterCost)
+    return std::nullopt;
+
+  return InnerIndex < OuterIndex;
 }
 
+// This function doesn't satisfy transitivity. Consider the following case.
+//
+// ```
+// for (int k = 0; k < N; k++) {
+//   for (int j = 0; j < N; j++) {
+//     for (int i = 0; i < N; i++) {
+//       dst0[i][j][k] += aa[i][j] + bb[i][j] + cc[j][k];
+//       dst1[k][j][i] += dd[i][j] + ee[i][j] + ff[j][k];
+//     }
+//   }
+// }
+//
+// ```
+//
+// The getInstrOrderCost will return the following value.
+//
+//  Outer | Inner | Cost
+// -------+-------+------
+//    k   |   j   |  -2
+//    j   |   i   |  -4
+//    k   |   i   |   0
+//
+// This means that this function reports the (k, j) and (j, i) interchanges as
+// profitable, but not (k, i). The root cause is that getInstrOrderCost only
+// sees the two loops being checked. We can resolve this if we also consider
+// the order going through other inductions. In the above case, we can infer
+// from `bb[i][j]` and `cc[j][k]` that interchanging `k` and `i` is profitable
+// (it is better to move the `k` loop to the inner position). However, such an
+// accurate calculation is expensive, so we don't do it.
 std::optional<bool>
 LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
   // Legacy cost model: this is rough cost estimation algorithm. It counts the
@@ -1184,11 +1218,27 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
   return std::optional<bool>(!DepMatrix.empty());
 }
 
-bool LoopInterchangeProfitability::isProfitable(
-    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const DenseMap<const Loop *, unsigned> &CostMap,
-    std::unique_ptr<CacheCost> &CC) {
+// A bubble-sort-like algorithm is used to sort the loop nest, so the
+// comparison function should ideally induce a strict weak ordering, as required
+// by the standard C++ sorting algorithms. In particular, isProfitable should
+// satisfy the following properties.
+//
+// Asymmetry: If isProfitable(a, b) is true then isProfitable(b, a) is false.
+// Transitivity: If both isProfitable(a, b) and isProfitable(b, c) are true
+// then isProfitable(a, c) is true.
+//
+// The most important thing is not to perform an unprofitable interchange. From
+// this point of view, asymmetry is essential: if both isProfitable(a, b) and
+// isProfitable(b, a) are true, then an unprofitable transformation (one of the
+// two) will be performed. On the other hand, a lack of transitivity might cause
+// some optimization opportunities to be lost, but won't trigger an
+// unprofitable one. Moreover, guaranteeing transitivity is expensive.
+// Therefore, isProfitable only guarantees asymmetry.
+bool LoopInterchangeProfitability::isProfitable(const Loop *InnerLoop,
+                                                const Loop *OuterLoop,
+                                                unsigned InnerLoopId,
+                                                unsigned OuterLoopId,
+                                                CharMatrix &DepMatrix) {
   // isProfitable() is structured to avoid endless loop interchange.
   // If loop cache analysis could decide the profitability then,
   // profitability check will stop and return the analysis result.
@@ -1197,15 +1247,14 @@ bool LoopInterchangeProfitability::isProfitable(
   // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
   // analysis the profitability then only, isProfitableForVectorization
   // will decide.
-  std::optional<bool> shouldInterchange =
-      isProfitablePerLoopCacheAnalysis(CostMap, CC);
-  if (!shouldInterchange.has_value()) {
-    shouldInterchange = isProfitablePerInstrOrderCost();
-    if (!shouldInterchange.has_value())
-      shouldInterchange =
+  std::optional<bool> ShouldInterchange = isProfitablePerLoopCacheAnalysis();
+  if (!ShouldInterchange.has_value()) {
+    ShouldInterchange = isProfitablePerInstrOrderCost();
+    if (!ShouldInterchange.has_value())
+      ShouldInterchange =
           isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
   }
-  if (!shouldInterchange.has_value()) {
+  if (!ShouldInterchange.has_value()) {
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
                                       InnerLoop->getStartLoc(),
@@ -1214,7 +1263,8 @@ bool LoopInterchangeProfitability::isProfitable(
                 "interchange.";
     });
     return false;
-  } else if (!shouldInterchange.value()) {
+  }
+  if (!ShouldInterchange.value()) {
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
                                       InnerLoop->getStartLoc(),
diff --git a/llvm/test/Transforms/LoopInterchange/profitability-redundant-interchange.ll b/llvm/test/Transforms/LoopInterchange/profitability-redundant-interchange.ll
@@ -0,0 +1,80 @@
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=1 -pass-remarks-output=%t -disable-output \
+; RUN:      -verify-dom-info -verify-loop-info
+; RUN: FileCheck -input-file %t %s
+
+
+; Test that the same pair of loops is not interchanged twice. This can happen
+; when the cost computed by CacheCost is the same for the `j` and `k` loops.
+;
+; #define N 4
+; int a[N*N][N*N][N*N];
+; void f() {
+;   for (int i = 0; i < N; i++)
+;     for (int j = 1; j < 2*N; j++)
+;       for (int k = 1; k < 2*N; k++)
+;         a[i][k+1][j-1] -= a[i+N-1][k][j];
+; }
+
+; CHECK:      --- !Passed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        f
+; CHECK-NEXT: Args:
+; CHECK-NEXT:    - String:          Loop interchanged with enclosing loop.
+; CHECK-NEXT: ...
+; CHECK-NEXT: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Dependence
+; CHECK-NEXT: Function:        f
+; CHECK-NEXT: Args:
+; CHECK-NEXT:  - String:       Cannot interchange loops due to dependences.
+; CHECK-NEXT: ...
+; CHECK-NEXT: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        f
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+; CHECK-NEXT: ...
+
+@a = dso_local local_unnamed_addr global [16 x [16 x [16 x i32]]] zeroinitializer, align 4
+
+define dso_local void @f() {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %indvars.iv46 = phi i64 [ 0, %entry ], [ %indvars.iv.next47, %for.cond.cleanup3 ]
+  %0 = add nuw nsw i64 %indvars.iv46, 3
+  br label %for.cond5.preheader
+
+for.cond5.preheader:
+  %indvars.iv41 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next42, %for.cond.cleanup7 ]
+  %1 = add nsw i64 %indvars.iv41, -1
+  br label %for.body8
+
+for.cond.cleanup3:
+  %indvars.iv.next47 = add nuw nsw i64 %indvars.iv46, 1
+  %exitcond50 = icmp ne i64 %indvars.iv.next47, 4
+  br i1 %exitcond50, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond.cleanup7:
+  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+  %exitcond45 = icmp ne i64 %indvars.iv.next42, 8
+  br i1 %exitcond45, label %for.cond5.preheader, label %for.cond.cleanup3
+
+for.body8:
+  %indvars.iv = phi i64 [ 1, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
+  %arrayidx12 = getelementptr inbounds nuw [16 x [16 x [16 x i32]]], ptr @a, i64 0, i64 %0, i64 %indvars.iv, i64 %indvars.iv41
+  %2 = load i32, ptr %arrayidx12, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx20 = getelementptr inbounds [16 x [16 x [16 x i32]]], ptr @a, i64 0, i64 %indvars.iv46, i64 %indvars.iv.next, i64 %1
+  %3 = load i32, ptr %arrayidx20, align 4
+  %sub21 = sub nsw i32 %3, %2
+  store i32 %sub21, ptr %arrayidx20, align 4
+  %exitcond = icmp ne i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.body8, label %for.cond.cleanup7
+
+for.cond.cleanup:
+  ret void
+}