Skip to content

Commit 45a8373

Browse files
committed
[LoopInterchange] Stop performing unprofitable interchange
LoopInterchange uses the bubble-sort fashion algorithm to sort the loops, but the comparison function (called isProfitable) didn't satisfy asymmetry. This means that both isProfitable(a, b) and isProfitable(b, a) can return true, triggering an unprofitable interchange. This patch fixes the problem and prevents the interchange from performing unprofitable transformations.
1 parent 8a0914c commit 45a8373

File tree

2 files changed

+174
-44
lines changed

2 files changed

+174
-44
lines changed

llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 94 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -356,26 +356,25 @@ class LoopInterchangeLegality {
356356
SmallVector<PHINode *, 8> InnerLoopInductions;
357357
};
358358

359+
using CostMapTy = DenseMap<const Loop *, std::pair<unsigned, CacheCostTy>>;
360+
359361
/// LoopInterchangeProfitability checks if it is profitable to interchange the
360362
/// loop.
361363
class LoopInterchangeProfitability {
362364
public:
363365
LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
364-
OptimizationRemarkEmitter *ORE)
365-
: OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
366+
OptimizationRemarkEmitter *ORE,
367+
const std::optional<CostMapTy> &CM)
368+
: OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE), CostMap(CM) {}
366369

367370
/// Check if the loop interchange is profitable.
368371
bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
369372
unsigned InnerLoopId, unsigned OuterLoopId,
370-
CharMatrix &DepMatrix,
371-
const DenseMap<const Loop *, unsigned> &CostMap,
372-
std::unique_ptr<CacheCost> &CC);
373+
CharMatrix &DepMatrix);
373374

374375
private:
375376
int getInstrOrderCost();
376-
std::optional<bool> isProfitablePerLoopCacheAnalysis(
377-
const DenseMap<const Loop *, unsigned> &CostMap,
378-
std::unique_ptr<CacheCost> &CC);
377+
std::optional<bool> isProfitablePerLoopCacheAnalysis();
379378
std::optional<bool> isProfitablePerInstrOrderCost();
380379
std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
381380
unsigned OuterLoopId,
@@ -388,6 +387,8 @@ class LoopInterchangeProfitability {
388387

389388
/// Interface to emit optimization remarks.
390389
OptimizationRemarkEmitter *ORE;
390+
391+
const std::optional<CostMapTy> &CostMap;
391392
};
392393

393394
/// LoopInterchangeTransform interchanges the loop.
@@ -497,11 +498,13 @@ struct LoopInterchange {
497498
// indicates the loop should be placed as the innermost loop.
498499
//
499500
// For the old pass manager CacheCost would be null.
500-
DenseMap<const Loop *, unsigned> CostMap;
501+
std::optional<CostMapTy> CostMap = std::nullopt;
501502
if (CC != nullptr) {
503+
CostMap = CostMapTy();
502504
const auto &LoopCosts = CC->getLoopCosts();
503505
for (unsigned i = 0; i < LoopCosts.size(); i++) {
504-
CostMap[LoopCosts[i].first] = i;
506+
const auto &Cost = LoopCosts[i];
507+
(*CostMap)[Cost.first] = std::make_pair(i, Cost.second);
505508
}
506509
}
507510
// We try to achieve the globally optimal memory access for the loopnest,
@@ -537,7 +540,7 @@ struct LoopInterchange {
537540
bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
538541
unsigned OuterLoopId,
539542
std::vector<std::vector<char>> &DependencyMatrix,
540-
const DenseMap<const Loop *, unsigned> &CostMap) {
543+
const std::optional<CostMapTy> &CostMap) {
541544
LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
542545
<< " and OuterLoopId = " << OuterLoopId << "\n");
543546
LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
@@ -546,9 +549,9 @@ struct LoopInterchange {
546549
return false;
547550
}
548551
LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
549-
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
552+
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE, CostMap);
550553
if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
551-
DependencyMatrix, CostMap, CC)) {
554+
DependencyMatrix)) {
552555
LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
553556
return false;
554557
}
@@ -1127,29 +1130,60 @@ int LoopInterchangeProfitability::getInstrOrderCost() {
11271130
}
11281131

11291132
std::optional<bool>
1130-
LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1131-
const DenseMap<const Loop *, unsigned> &CostMap,
1132-
std::unique_ptr<CacheCost> &CC) {
1133+
LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis() {
11331134
// This is the new cost model returned from loop cache analysis.
11341135
// A smaller index means the loop should be placed an outer loop, and vice
11351136
// versa.
1136-
if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) {
1137-
unsigned InnerIndex = 0, OuterIndex = 0;
1138-
InnerIndex = CostMap.find(InnerLoop)->second;
1139-
OuterIndex = CostMap.find(OuterLoop)->second;
1140-
LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1141-
<< ", OuterIndex = " << OuterIndex << "\n");
1142-
if (InnerIndex < OuterIndex)
1143-
return std::optional<bool>(true);
1144-
assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1145-
"numbers to each loop");
1146-
if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1147-
return std::nullopt;
1148-
return std::optional<bool>(false);
1149-
}
1150-
return std::nullopt;
1137+
if (!CostMap.has_value())
1138+
return std::nullopt;
1139+
1140+
auto InnerIte = CostMap->find(InnerLoop);
1141+
auto OuterIte = CostMap->find(OuterLoop);
1142+
if (InnerIte == CostMap->end() || OuterIte == CostMap->end())
1143+
return std::nullopt;
1144+
1145+
const auto &[InnerIndex, InnerCost] = InnerIte->second;
1146+
const auto &[OuterIndex, OuterCost] = OuterIte->second;
1147+
LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1148+
<< ", OuterIndex = " << OuterIndex << "\n");
1149+
assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1150+
"numbers to each loop");
1151+
1152+
if (InnerCost == OuterCost)
1153+
return std::nullopt;
1154+
1155+
return InnerIndex < OuterIndex;
11511156
}
11521157

1158+
// This function doesn't satisfy transitivity. Consider the following case.
1159+
//
1160+
// ```
1161+
// for (int k = 0; k < N; k++) {
1162+
// for (int j = 0; j < N; j++) {
1163+
// for (int i = 0; i < N; i++) {
1164+
// dst0[i][j][k] += aa[i][j] + bb[i][j] + cc[j][k];
1165+
// dst1[k][j][i] += dd[i][j] + ee[i][j] + ff[j][k];
1166+
// }
1167+
// }
1168+
// }
1169+
//
1170+
// ```
1171+
//
1172+
// The getInstrOrderCost will return the following value.
1173+
//
1174+
// Outer | Inner | Cost
1175+
// -------+-------+------
1176+
// k | j | -2
1177+
// j | i | -4
1178+
// k | i | 0
1179+
//
1180+
// This means that this function says interchanging (k, j) loops and (j, i)
1181+
// loops are profitable, but not (k, i). The root cause of this is that the
1182+
// getInstrOrderCost only see the loops we are checking. We can resolve this if
1183+
// we also consider the order going through other inductions. As for the above
1184+
// case, we can induce that interchanging `k` and `i` is profitable (it is
1185+
// better to move the `k` loop to inner position) by `bb[i][j]` and `cc[j][k]`.
1186+
// However, such accurate calculation is expensive, so that we don't do it.
11531187
std::optional<bool>
11541188
LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
11551189
// Legacy cost model: this is rough cost estimation algorithm. It counts the
@@ -1184,11 +1218,27 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
11841218
return std::optional<bool>(!DepMatrix.empty());
11851219
}
11861220

1187-
bool LoopInterchangeProfitability::isProfitable(
1188-
const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1189-
unsigned OuterLoopId, CharMatrix &DepMatrix,
1190-
const DenseMap<const Loop *, unsigned> &CostMap,
1191-
std::unique_ptr<CacheCost> &CC) {
1221+
// The bubble-sort fashion algorithm is adopted to sort the loop nest, so the
1222+
// comparison function should ideally induce a strict weak ordering required by
1223+
// some standard C++ libraries. In particular, isProfitable should hold the
1224+
// following properties.
1225+
//
1226+
// Asymmetry: If isProfitable(a, b) is true then isProfitable(b, a) is false.
1227+
// Transitivity: If both isProfitable(a, b) and isProfitable(b, c) is true then
1228+
// isProfitable(a, c) is true.
1229+
//
1230+
// The most important thing is not to make unprofitable interchange. From this
1231+
// point of view, asymmetry is important. This is because if both
1232+
// isProfitable(a, b) and isProfitable(b, a) are true, then an unprofitable
1233+
// transformation (one of them) will be performed. On the other hand, a lack of
1234+
// transitivity might cause some optimization opportunities to be lost, but
1235+
// won't trigger an unprofitable one. Moreover, guaranteeing transitivity is
1236+
// expensive. Therefore, isProfitable only holds the asymmetry.
1237+
bool LoopInterchangeProfitability::isProfitable(const Loop *InnerLoop,
1238+
const Loop *OuterLoop,
1239+
unsigned InnerLoopId,
1240+
unsigned OuterLoopId,
1241+
CharMatrix &DepMatrix) {
11921242
// isProfitable() is structured to avoid endless loop interchange.
11931243
// If loop cache analysis could decide the profitability then,
11941244
// profitability check will stop and return the analysis result.
@@ -1197,15 +1247,14 @@ bool LoopInterchangeProfitability::isProfitable(
11971247
// profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
11981248
// analysis the profitability then only, isProfitableForVectorization
11991249
// will decide.
1200-
std::optional<bool> shouldInterchange =
1201-
isProfitablePerLoopCacheAnalysis(CostMap, CC);
1202-
if (!shouldInterchange.has_value()) {
1203-
shouldInterchange = isProfitablePerInstrOrderCost();
1204-
if (!shouldInterchange.has_value())
1205-
shouldInterchange =
1250+
std::optional<bool> ShouldInterchange = isProfitablePerLoopCacheAnalysis();
1251+
if (!ShouldInterchange.has_value()) {
1252+
ShouldInterchange = isProfitablePerInstrOrderCost();
1253+
if (!ShouldInterchange.has_value())
1254+
ShouldInterchange =
12061255
isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
12071256
}
1208-
if (!shouldInterchange.has_value()) {
1257+
if (!ShouldInterchange.has_value()) {
12091258
ORE->emit([&]() {
12101259
return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
12111260
InnerLoop->getStartLoc(),
@@ -1214,7 +1263,8 @@ bool LoopInterchangeProfitability::isProfitable(
12141263
"interchange.";
12151264
});
12161265
return false;
1217-
} else if (!shouldInterchange.value()) {
1266+
}
1267+
if (!ShouldInterchange.value()) {
12181268
ORE->emit([&]() {
12191269
return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
12201270
InnerLoop->getStartLoc(),
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
; RUN: opt < %s -passes=loop-interchange -cache-line-size=1 -pass-remarks-output=%t -disable-output \
2+
; RUN: -verify-dom-info -verify-loop-info
3+
; RUN: FileCheck -input-file %t %s
4+
5+
6+
; Test that the same pair of loops are not interchanged twice. This is the case
7+
; when the cost computed by CacheCost is the same for the loop of `j` and `k`.
8+
;
9+
; #define N 4
10+
; int a[N*N][N*N][N*N];
11+
; void f() {
12+
; for (int i = 0; i < N; i++)
13+
; for (int j = 1; j < 2*N; j++)
14+
; for (int k = 1; k < 2*N; k++)
15+
; a[i][k+1][j-1] -= a[i+N-1][k][j];
16+
; }
17+
18+
; CHECK: --- !Passed
19+
; CHECK-NEXT: Pass: loop-interchange
20+
; CHECK-NEXT: Name: Interchanged
21+
; CHECK-NEXT: Function: f
22+
; CHECK-NEXT: Args:
23+
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
24+
; CHECK-NEXT: ...
25+
; CHECK-NEXT: --- !Missed
26+
; CHECK-NEXT: Pass: loop-interchange
27+
; CHECK-NEXT: Name: Dependence
28+
; CHECK-NEXT: Function: f
29+
; CHECK-NEXT: Args:
30+
; CHECK-NEXT: - String: Cannot interchange loops due to dependences.
31+
; CHECK-NEXT: ...
32+
; CHECK-NEXT: --- !Missed
33+
; CHECK-NEXT: Pass: loop-interchange
34+
; CHECK-NEXT: Name: InterchangeNotProfitable
35+
; CHECK-NEXT: Function: f
36+
; CHECK-NEXT: Args:
37+
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
38+
; CHECK-NEXT: ...
39+
40+
@a = dso_local local_unnamed_addr global [16 x [16 x [16 x i32]]] zeroinitializer, align 4
41+
42+
define dso_local void @f() {
43+
entry:
44+
br label %for.cond1.preheader
45+
46+
for.cond1.preheader:
47+
%indvars.iv46 = phi i64 [ 0, %entry ], [ %indvars.iv.next47, %for.cond.cleanup3 ]
48+
%0 = add nuw nsw i64 %indvars.iv46, 3
49+
br label %for.cond5.preheader
50+
51+
for.cond5.preheader:
52+
%indvars.iv41 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next42, %for.cond.cleanup7 ]
53+
%1 = add nsw i64 %indvars.iv41, -1
54+
br label %for.body8
55+
56+
for.cond.cleanup3:
57+
%indvars.iv.next47 = add nuw nsw i64 %indvars.iv46, 1
58+
%exitcond50 = icmp ne i64 %indvars.iv.next47, 4
59+
br i1 %exitcond50, label %for.cond1.preheader, label %for.cond.cleanup
60+
61+
for.cond.cleanup7:
62+
%indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
63+
%exitcond45 = icmp ne i64 %indvars.iv.next42, 8
64+
br i1 %exitcond45, label %for.cond5.preheader, label %for.cond.cleanup3
65+
66+
for.body8:
67+
%indvars.iv = phi i64 [ 1, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
68+
%arrayidx12 = getelementptr inbounds nuw [16 x [16 x [16 x i32]]], ptr @a, i64 0, i64 %0, i64 %indvars.iv, i64 %indvars.iv41
69+
%2 = load i32, ptr %arrayidx12, align 4
70+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
71+
%arrayidx20 = getelementptr inbounds [16 x [16 x [16 x i32]]], ptr @a, i64 0, i64 %indvars.iv46, i64 %indvars.iv.next, i64 %1
72+
%3 = load i32, ptr %arrayidx20, align 4
73+
%sub21 = sub nsw i32 %3, %2
74+
store i32 %sub21, ptr %arrayidx20, align 4
75+
%exitcond = icmp ne i64 %indvars.iv.next, 8
76+
br i1 %exitcond, label %for.body8, label %for.cond.cleanup7
77+
78+
for.cond.cleanup:
79+
ret void
80+
}

0 commit comments

Comments
 (0)