Skip to content

Commit 2e3567d

Browse files
committed
[LoopInterchange] Constrain load/stores in a loop
In the current state of the code, the transform computes entries for the dependency matrix until it reaches MaxMemInstrCount, which is 100. After the 99th entry it terminates, and thus the work done up to that point is wasted compile-time. It would be nice if we could compute the total number of entries upfront and exit early if the number of entries is > 100. However, computing the number of entries is not always possible, as it depends on two factors: 1. The number of load-store pairs in a loop. 2. The number of common loop levels for each pair. This patch simply constrains the whole computation by the number of load and store instructions in the loop. With a limit of 64, I see 39 interchanges compared to 42 with the trunk. (With 128, it is 42 vs 43, but I see an increase in compile-time.) In another approach, I experimented with computing factor 1 (the number of load-store pairs) and constraining the number of pairs, but that did not lead to any additional benefit in terms of compile-time. However, when other issues are fixed, I can revisit this approach.
1 parent 7dd34ba commit 2e3567d

File tree

2 files changed

+289
-13
lines changed

2 files changed

+289
-13
lines changed

llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,14 @@ static cl::opt<int> LoopInterchangeCostThreshold(
5757
"loop-interchange-threshold", cl::init(0), cl::Hidden,
5858
cl::desc("Interchange if you gain more than this number"));
5959

60+
// Maximum number of load-stores that can be handled in the dependency matrix.
61+
static cl::opt<unsigned int> MaxMemInstrCount(
62+
"loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden,
63+
cl::desc(
64+
"Maximum number of load-store instructions that should be handled "
65+
"in the dependency matrix. Higher value may lead to more interchanges "
66+
"at the cost of compile-time"));
67+
6068
namespace {
6169

6270
using LoopVector = SmallVector<Loop *, 8>;
@@ -66,9 +74,6 @@ using CharMatrix = std::vector<std::vector<char>>;
6674

6775
} // end anonymous namespace
6876

69-
// Maximum number of dependencies that can be handled in the dependency matrix.
70-
static const unsigned MaxMemInstrCount = 100;
71-
7277
// Maximum loop depth supported.
7378
static const unsigned MaxLoopNestDepth = 10;
7479

@@ -84,7 +89,8 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
8489

8590
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
8691
Loop *L, DependenceInfo *DI,
87-
ScalarEvolution *SE) {
92+
ScalarEvolution *SE,
93+
OptimizationRemarkEmitter *ORE) {
8894
using ValueVector = SmallVector<Value *, 16>;
8995

9096
ValueVector MemInstr;
@@ -109,7 +115,18 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
109115

110116
LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
111117
<< " Loads and Stores to analyze\n");
112-
118+
if (MemInstr.size() > MaxMemInstrCount) {
119+
LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
120+
<< MaxMemInstrCount << " load/stores in a loop\n");
121+
ORE->emit([&]() {
122+
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
123+
L->getStartLoc(), L->getHeader())
124+
<< "Number of loads/stores exceeded, the supported maximum "
125+
"can be increased with option "
126+
"-loop-interchange-maxmeminstr-count.";
127+
});
128+
return false;
129+
}
113130
ValueVector::iterator I, IE, J, JE;
114131
StringSet<> Seen;
115132

@@ -161,12 +178,6 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
161178
// Make sure we only add unique entries to the dependency matrix.
162179
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
163180
DepMatrix.push_back(Dep);
164-
165-
if (DepMatrix.size() > MaxMemInstrCount) {
166-
LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
167-
<< " dependencies inside loop\n");
168-
return false;
169-
}
170181
}
171182
}
172183
}
@@ -450,7 +461,7 @@ struct LoopInterchange {
450461
CharMatrix DependencyMatrix;
451462
Loop *OuterMostLoop = *(LoopList.begin());
452463
if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
453-
OuterMostLoop, DI, SE)) {
464+
OuterMostLoop, DI, SE, ORE)) {
454465
LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
455466
return false;
456467
}
@@ -1725,10 +1736,15 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
17251736
LPMUpdater &U) {
17261737
Function &F = *LN.getParent();
17271738
SmallVector<Loop *, 8> LoopList(LN.getLoops());
1739+
1740+
if (MaxMemInstrCount < 1) {
1741+
LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
1742+
return PreservedAnalyses::all();
1743+
}
1744+
17281745
// Ensure minimum depth of the loop nest to do the interchange.
17291746
if (!hasMinimumLoopDepth(LoopList))
17301747
return PreservedAnalyses::all();
1731-
17321748
DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
17331749
std::unique_ptr<CacheCost> CC =
17341750
CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
; RUN: opt < %s -passes=loop-interchange --pass-remarks-missed=loop-interchange -disable-output 2>&1 | FileCheck %s
2+
; RUN: opt < %s -passes=loop-interchange --pass-remarks-missed=loop-interchange -loop-interchange-max-meminstr-count=75 \
3+
; RUN:   -disable-output 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-INSTR-COUNT %s
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
@A = dso_local local_unnamed_addr global [2048 x [2048 x i32]] zeroinitializer, align 4
7+
@B = dso_local local_unnamed_addr global [2048 x [2048 x i32]] zeroinitializer, align 4
8+
@C = dso_local local_unnamed_addr global [2048 x [2048 x i32]] zeroinitializer, align 4
9+
10+
; CHECK: Number of loads/stores exceeded, the supported maximum
11+
; can be increased with option -loop-interchange-maxmeminstr-count.
12+
; CHECK-INSTR-COUNT-NOT: Number of loads/stores exceeded, the supported maximum
13+
; can be increased with option -loop-interchange-maxmeminstr-count.
14+
define dso_local noundef i32 @many_load_stores() {
15+
br label %1
16+
17+
1: ; preds = %9, %0
18+
%2 = phi i32 [ 0, %0 ], [ %10, %9 ]
19+
%3 = icmp slt i32 %2, 2048
20+
br i1 %3, label %5, label %4
21+
22+
4: ; preds = %1
23+
ret i32 0
24+
25+
5: ; preds = %1
26+
br label %6
27+
28+
6: ; preds = %11, %5
29+
%7 = phi i32 [ 0, %5 ], [ %208, %11 ]
30+
%8 = icmp slt i32 %7, 85
31+
br i1 %8, label %11, label %9
32+
33+
9: ; preds = %6
34+
%10 = add nsw i32 %2, 1
35+
br label %1
36+
37+
11: ; preds = %6
38+
%12 = sext i32 %2 to i64
39+
%13 = getelementptr inbounds [2048 x [2048 x i32]], [2048 x [2048 x i32]]* @B, i64 0, i64 %12
40+
%14 = sext i32 %7 to i64
41+
%15 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %14
42+
%16 = load i32, i32* %15, align 4
43+
%17 = getelementptr inbounds [2048 x [2048 x i32]], [2048 x [2048 x i32]]* @C, i64 0, i64 %12
44+
%18 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %14
45+
%19 = load i32, i32* %18, align 4
46+
%20 = add nsw i32 %16, %19
47+
%21 = getelementptr inbounds [2048 x [2048 x i32]], [2048 x [2048 x i32]]* @A, i64 0, i64 %12
48+
%22 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %14
49+
store i32 %20, i32* %22, align 4
50+
%23 = add nsw i32 %7, 1
51+
%24 = sext i32 %23 to i64
52+
%25 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %24
53+
%26 = load i32, i32* %25, align 4
54+
%27 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %24
55+
%28 = load i32, i32* %27, align 4
56+
%29 = add nsw i32 %26, %28
57+
%30 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %24
58+
store i32 %29, i32* %30, align 4
59+
%31 = add nsw i32 %23, 1
60+
%32 = sext i32 %31 to i64
61+
%33 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %32
62+
%34 = load i32, i32* %33, align 4
63+
%35 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %32
64+
%36 = load i32, i32* %35, align 4
65+
%37 = add nsw i32 %34, %36
66+
%38 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %32
67+
store i32 %37, i32* %38, align 4
68+
%39 = add nsw i32 %31, 1
69+
%40 = sext i32 %39 to i64
70+
%41 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %40
71+
%42 = load i32, i32* %41, align 4
72+
%43 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %40
73+
%44 = load i32, i32* %43, align 4
74+
%45 = add nsw i32 %42, %44
75+
%46 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %40
76+
store i32 %45, i32* %46, align 4
77+
%47 = add nsw i32 %39, 1
78+
%48 = sext i32 %47 to i64
79+
%49 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %48
80+
%50 = load i32, i32* %49, align 4
81+
%51 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %48
82+
%52 = load i32, i32* %51, align 4
83+
%53 = add nsw i32 %50, %52
84+
%54 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %48
85+
store i32 %53, i32* %54, align 4
86+
%55 = add nsw i32 %47, 1
87+
%56 = sext i32 %55 to i64
88+
%57 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %56
89+
%58 = load i32, i32* %57, align 4
90+
%59 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %56
91+
%60 = load i32, i32* %59, align 4
92+
%61 = add nsw i32 %58, %60
93+
%62 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %56
94+
store i32 %61, i32* %62, align 4
95+
%63 = add nsw i32 %55, 1
96+
%64 = sext i32 %63 to i64
97+
%65 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %64
98+
%66 = load i32, i32* %65, align 4
99+
%67 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %64
100+
%68 = load i32, i32* %67, align 4
101+
%69 = add nsw i32 %66, %68
102+
%70 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %64
103+
store i32 %69, i32* %70, align 4
104+
%71 = add nsw i32 %63, 1
105+
%72 = sext i32 %71 to i64
106+
%73 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %72
107+
%74 = load i32, i32* %73, align 4
108+
%75 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %72
109+
%76 = load i32, i32* %75, align 4
110+
%77 = add nsw i32 %74, %76
111+
%78 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %72
112+
store i32 %77, i32* %78, align 4
113+
%79 = add nsw i32 %71, 1
114+
%80 = sext i32 %79 to i64
115+
%81 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %80
116+
%82 = load i32, i32* %81, align 4
117+
%83 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %80
118+
%84 = load i32, i32* %83, align 4
119+
%85 = add nsw i32 %82, %84
120+
%86 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %80
121+
store i32 %85, i32* %86, align 4
122+
%87 = add nsw i32 %79, 1
123+
%88 = sext i32 %87 to i64
124+
%89 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %88
125+
%90 = load i32, i32* %89, align 4
126+
%91 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %88
127+
%92 = load i32, i32* %91, align 4
128+
%93 = add nsw i32 %90, %92
129+
%94 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %88
130+
store i32 %93, i32* %94, align 4
131+
%95 = add nsw i32 %87, 1
132+
%96 = sext i32 %95 to i64
133+
%97 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %96
134+
%98 = load i32, i32* %97, align 4
135+
%99 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %96
136+
%100 = load i32, i32* %99, align 4
137+
%101 = add nsw i32 %98, %100
138+
%102 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %96
139+
store i32 %101, i32* %102, align 4
140+
%103 = add nsw i32 %95, 1
141+
%104 = sext i32 %103 to i64
142+
%105 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %104
143+
%106 = load i32, i32* %105, align 4
144+
%107 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %104
145+
%108 = load i32, i32* %107, align 4
146+
%109 = add nsw i32 %106, %108
147+
%110 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %104
148+
store i32 %109, i32* %110, align 4
149+
%111 = add nsw i32 %103, 1
150+
%112 = sext i32 %111 to i64
151+
%113 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %112
152+
%114 = load i32, i32* %113, align 4
153+
%115 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %112
154+
%116 = load i32, i32* %115, align 4
155+
%117 = add nsw i32 %114, %116
156+
%118 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %112
157+
store i32 %117, i32* %118, align 4
158+
%119 = add nsw i32 %111, 1
159+
%120 = sext i32 %119 to i64
160+
%121 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %120
161+
%122 = load i32, i32* %121, align 4
162+
%123 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %120
163+
%124 = load i32, i32* %123, align 4
164+
%125 = add nsw i32 %122, %124
165+
%126 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %120
166+
store i32 %125, i32* %126, align 4
167+
%127 = add nsw i32 %119, 1
168+
%128 = sext i32 %127 to i64
169+
%129 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %128
170+
%130 = load i32, i32* %129, align 4
171+
%131 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %128
172+
%132 = load i32, i32* %131, align 4
173+
%133 = add nsw i32 %130, %132
174+
%134 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %128
175+
store i32 %133, i32* %134, align 4
176+
%135 = add nsw i32 %127, 1
177+
%136 = sext i32 %135 to i64
178+
%137 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %136
179+
%138 = load i32, i32* %137, align 4
180+
%139 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %136
181+
%140 = load i32, i32* %139, align 4
182+
%141 = add nsw i32 %138, %140
183+
%142 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %136
184+
store i32 %141, i32* %142, align 4
185+
%143 = add nsw i32 %135, 1
186+
%144 = sext i32 %143 to i64
187+
%145 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %144
188+
%146 = load i32, i32* %145, align 4
189+
%147 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %144
190+
%148 = load i32, i32* %147, align 4
191+
%149 = add nsw i32 %146, %148
192+
%150 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %144
193+
store i32 %149, i32* %150, align 4
194+
%151 = add nsw i32 %143, 1
195+
%152 = sext i32 %151 to i64
196+
%153 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %152
197+
%154 = load i32, i32* %153, align 4
198+
%155 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %152
199+
%156 = load i32, i32* %155, align 4
200+
%157 = add nsw i32 %154, %156
201+
%158 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %152
202+
store i32 %157, i32* %158, align 4
203+
%159 = add nsw i32 %151, 1
204+
%160 = sext i32 %159 to i64
205+
%161 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %160
206+
%162 = load i32, i32* %161, align 4
207+
%163 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %160
208+
%164 = load i32, i32* %163, align 4
209+
%165 = add nsw i32 %162, %164
210+
%166 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %160
211+
store i32 %165, i32* %166, align 4
212+
%167 = add nsw i32 %159, 1
213+
%168 = sext i32 %167 to i64
214+
%169 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %168
215+
%170 = load i32, i32* %169, align 4
216+
%171 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %168
217+
%172 = load i32, i32* %171, align 4
218+
%173 = add nsw i32 %170, %172
219+
%174 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %168
220+
store i32 %173, i32* %174, align 4
221+
%175 = add nsw i32 %167, 1
222+
%176 = sext i32 %175 to i64
223+
%177 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %176
224+
%178 = load i32, i32* %177, align 4
225+
%179 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %176
226+
%180 = load i32, i32* %179, align 4
227+
%181 = add nsw i32 %178, %180
228+
%182 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %176
229+
store i32 %181, i32* %182, align 4
230+
%183 = add nsw i32 %175, 1
231+
%184 = sext i32 %183 to i64
232+
%185 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %184
233+
%186 = load i32, i32* %185, align 4
234+
%187 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %184
235+
%188 = load i32, i32* %187, align 4
236+
%189 = add nsw i32 %186, %188
237+
%190 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %184
238+
store i32 %189, i32* %190, align 4
239+
%191 = add nsw i32 %183, 1
240+
%192 = sext i32 %191 to i64
241+
%193 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %192
242+
%194 = load i32, i32* %193, align 4
243+
%195 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %192
244+
%196 = load i32, i32* %195, align 4
245+
%197 = add nsw i32 %194, %196
246+
%198 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %192
247+
store i32 %197, i32* %198, align 4
248+
%199 = add nsw i32 %191, 1
249+
%200 = sext i32 %199 to i64
250+
%201 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %200
251+
%202 = load i32, i32* %201, align 4
252+
%203 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %200
253+
%204 = load i32, i32* %203, align 4
254+
%205 = add nsw i32 %202, %204
255+
%206 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %200
256+
store i32 %205, i32* %206, align 4
257+
%207 = add nsw i32 %199, 1
258+
%208 = add nsw i32 %207, 24
259+
br label %6
260+
}

0 commit comments

Comments
 (0)