Skip to content

Commit ca6235d

Browse files
committed
[LoopInterchange] Constrain load/stores in a loop
In the current state of the code, the transform computes entries for the dependency matrix until it reaches MaxMemInstrCount, which is 100. After the 100th entry it terminates, so the compile time spent computing those entries is wasted. It would be nice if we could compute the total number of entries upfront and exit early if the number of entries is > 100. However, computing the number of entries upfront is not always possible, as it depends on two factors: 1. The number of load-store pairs in a loop. 2. The number of common loop levels for each of the pairs. This patch just constrains the whole computation based on the number of load and store instructions in the loop. In another approach, I experimented with computing factor 1 and constraining the number of pairs, but that did not lead to any additional compile-time benefit. However, when other issues are fixed, I can revisit this approach.
1 parent f0b09df commit ca6235d

File tree

2 files changed

+272
-11
lines changed

2 files changed

+272
-11
lines changed

llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ using CharMatrix = std::vector<std::vector<char>>;
6666

6767
} // end anonymous namespace
6868

69-
// Maximum number of dependencies that can be handled in the dependency matrix.
70-
static const unsigned MaxMemInstrCount = 100;
69+
// Maximum number of load/store instructions in a loop for which the dependency matrix is computed.
70+
static const unsigned MaxMemInstrCount = 64;
7171

7272
// Maximum loop depth supported.
7373
static const unsigned MaxLoopNestDepth = 10;
@@ -84,7 +84,8 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
8484

8585
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
8686
Loop *L, DependenceInfo *DI,
87-
ScalarEvolution *SE) {
87+
ScalarEvolution *SE,
88+
OptimizationRemarkEmitter *ORE) {
8889
using ValueVector = SmallVector<Value *, 16>;
8990

9091
ValueVector MemInstr;
@@ -110,6 +111,16 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
110111
LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
111112
<< " Loads and Stores to analyze\n");
112113

114+
if (MemInstr.size() > MaxMemInstrCount) {
115+
LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
116+
<< MaxMemInstrCount << " load stores in a loop\n");
117+
ORE->emit([&]() {
118+
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
119+
L->getStartLoc(), L->getHeader())
120+
<< "Reached maximum loads/stores than can be supported in the loop.";
121+
});
122+
return false;
123+
}
113124
ValueVector::iterator I, IE, J, JE;
114125
StringSet<> Seen;
115126

@@ -161,12 +172,6 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
161172
// Make sure we only add unique entries to the dependency matrix.
162173
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
163174
DepMatrix.push_back(Dep);
164-
165-
if (DepMatrix.size() > MaxMemInstrCount) {
166-
LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
167-
<< " dependencies inside loop\n");
168-
return false;
169-
}
170175
}
171176
}
172177
}
@@ -450,7 +455,7 @@ struct LoopInterchange {
450455
CharMatrix DependencyMatrix;
451456
Loop *OuterMostLoop = *(LoopList.begin());
452457
if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
453-
OuterMostLoop, DI, SE)) {
458+
OuterMostLoop, DI, SE, ORE)) {
454459
LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
455460
return false;
456461
}
@@ -1728,7 +1733,6 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
17281733
// Ensure minimum depth of the loop nest to do the interchange.
17291734
if (!hasMinimumLoopDepth(LoopList))
17301735
return PreservedAnalyses::all();
1731-
17321736
DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
17331737
std::unique_ptr<CacheCost> CC =
17341738
CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
; REQUIRES: asserts
2+
; RUN: opt < %s -passes=loop-interchange -debug -disable-output 2>&1 | FileCheck %s
3+
target triple = "aarch64-unknown-linux-gnu"
4+
5+
@A = dso_local local_unnamed_addr global [2048 x [2048 x i32]] zeroinitializer, align 4
6+
@B = dso_local local_unnamed_addr global [2048 x [2048 x i32]] zeroinitializer, align 4
7+
@C = dso_local local_unnamed_addr global [2048 x [2048 x i32]] zeroinitializer, align 4
8+
9+
; CHECK: The transform doesn't support more than 64 load stores in a loop
10+
; CHECK: Populating dependency matrix failed
11+
define dso_local noundef i32 @many_load_stores() {
12+
br label %1
13+
14+
1: ; preds = %9, %0
15+
%2 = phi i32 [ 0, %0 ], [ %10, %9 ]
16+
%3 = icmp slt i32 %2, 2048
17+
br i1 %3, label %5, label %4
18+
19+
4: ; preds = %1
20+
ret i32 0
21+
22+
5: ; preds = %1
23+
br label %6
24+
25+
6: ; preds = %11, %5
26+
%7 = phi i32 [ 0, %5 ], [ %208, %11 ]
27+
%8 = icmp slt i32 %7, 85
28+
br i1 %8, label %11, label %9
29+
30+
9: ; preds = %6
31+
%10 = add nsw i32 %2, 1
32+
br label %1
33+
34+
11: ; preds = %6
35+
%12 = sext i32 %2 to i64
36+
%13 = getelementptr inbounds [2048 x [2048 x i32]], [2048 x [2048 x i32]]* @B, i64 0, i64 %12
37+
%14 = sext i32 %7 to i64
38+
%15 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %14
39+
%16 = load i32, i32* %15, align 4
40+
%17 = getelementptr inbounds [2048 x [2048 x i32]], [2048 x [2048 x i32]]* @C, i64 0, i64 %12
41+
%18 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %14
42+
%19 = load i32, i32* %18, align 4
43+
%20 = add nsw i32 %16, %19
44+
%21 = getelementptr inbounds [2048 x [2048 x i32]], [2048 x [2048 x i32]]* @A, i64 0, i64 %12
45+
%22 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %14
46+
store i32 %20, i32* %22, align 4
47+
%23 = add nsw i32 %7, 1
48+
%24 = sext i32 %23 to i64
49+
%25 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %24
50+
%26 = load i32, i32* %25, align 4
51+
%27 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %24
52+
%28 = load i32, i32* %27, align 4
53+
%29 = add nsw i32 %26, %28
54+
%30 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %24
55+
store i32 %29, i32* %30, align 4
56+
%31 = add nsw i32 %23, 1
57+
%32 = sext i32 %31 to i64
58+
%33 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %32
59+
%34 = load i32, i32* %33, align 4
60+
%35 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %32
61+
%36 = load i32, i32* %35, align 4
62+
%37 = add nsw i32 %34, %36
63+
%38 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %32
64+
store i32 %37, i32* %38, align 4
65+
%39 = add nsw i32 %31, 1
66+
%40 = sext i32 %39 to i64
67+
%41 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %40
68+
%42 = load i32, i32* %41, align 4
69+
%43 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %40
70+
%44 = load i32, i32* %43, align 4
71+
%45 = add nsw i32 %42, %44
72+
%46 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %40
73+
store i32 %45, i32* %46, align 4
74+
%47 = add nsw i32 %39, 1
75+
%48 = sext i32 %47 to i64
76+
%49 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %48
77+
%50 = load i32, i32* %49, align 4
78+
%51 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %48
79+
%52 = load i32, i32* %51, align 4
80+
%53 = add nsw i32 %50, %52
81+
%54 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %48
82+
store i32 %53, i32* %54, align 4
83+
%55 = add nsw i32 %47, 1
84+
%56 = sext i32 %55 to i64
85+
%57 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %56
86+
%58 = load i32, i32* %57, align 4
87+
%59 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %56
88+
%60 = load i32, i32* %59, align 4
89+
%61 = add nsw i32 %58, %60
90+
%62 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %56
91+
store i32 %61, i32* %62, align 4
92+
%63 = add nsw i32 %55, 1
93+
%64 = sext i32 %63 to i64
94+
%65 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %64
95+
%66 = load i32, i32* %65, align 4
96+
%67 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %64
97+
%68 = load i32, i32* %67, align 4
98+
%69 = add nsw i32 %66, %68
99+
%70 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %64
100+
store i32 %69, i32* %70, align 4
101+
%71 = add nsw i32 %63, 1
102+
%72 = sext i32 %71 to i64
103+
%73 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %72
104+
%74 = load i32, i32* %73, align 4
105+
%75 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %72
106+
%76 = load i32, i32* %75, align 4
107+
%77 = add nsw i32 %74, %76
108+
%78 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %72
109+
store i32 %77, i32* %78, align 4
110+
%79 = add nsw i32 %71, 1
111+
%80 = sext i32 %79 to i64
112+
%81 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %80
113+
%82 = load i32, i32* %81, align 4
114+
%83 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %80
115+
%84 = load i32, i32* %83, align 4
116+
%85 = add nsw i32 %82, %84
117+
%86 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %80
118+
store i32 %85, i32* %86, align 4
119+
%87 = add nsw i32 %79, 1
120+
%88 = sext i32 %87 to i64
121+
%89 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %88
122+
%90 = load i32, i32* %89, align 4
123+
%91 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %88
124+
%92 = load i32, i32* %91, align 4
125+
%93 = add nsw i32 %90, %92
126+
%94 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %88
127+
store i32 %93, i32* %94, align 4
128+
%95 = add nsw i32 %87, 1
129+
%96 = sext i32 %95 to i64
130+
%97 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %96
131+
%98 = load i32, i32* %97, align 4
132+
%99 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %96
133+
%100 = load i32, i32* %99, align 4
134+
%101 = add nsw i32 %98, %100
135+
%102 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %96
136+
store i32 %101, i32* %102, align 4
137+
%103 = add nsw i32 %95, 1
138+
%104 = sext i32 %103 to i64
139+
%105 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %104
140+
%106 = load i32, i32* %105, align 4
141+
%107 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %104
142+
%108 = load i32, i32* %107, align 4
143+
%109 = add nsw i32 %106, %108
144+
%110 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %104
145+
store i32 %109, i32* %110, align 4
146+
%111 = add nsw i32 %103, 1
147+
%112 = sext i32 %111 to i64
148+
%113 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %112
149+
%114 = load i32, i32* %113, align 4
150+
%115 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %112
151+
%116 = load i32, i32* %115, align 4
152+
%117 = add nsw i32 %114, %116
153+
%118 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %112
154+
store i32 %117, i32* %118, align 4
155+
%119 = add nsw i32 %111, 1
156+
%120 = sext i32 %119 to i64
157+
%121 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %120
158+
%122 = load i32, i32* %121, align 4
159+
%123 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %120
160+
%124 = load i32, i32* %123, align 4
161+
%125 = add nsw i32 %122, %124
162+
%126 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %120
163+
store i32 %125, i32* %126, align 4
164+
%127 = add nsw i32 %119, 1
165+
%128 = sext i32 %127 to i64
166+
%129 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %128
167+
%130 = load i32, i32* %129, align 4
168+
%131 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %128
169+
%132 = load i32, i32* %131, align 4
170+
%133 = add nsw i32 %130, %132
171+
%134 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %128
172+
store i32 %133, i32* %134, align 4
173+
%135 = add nsw i32 %127, 1
174+
%136 = sext i32 %135 to i64
175+
%137 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %136
176+
%138 = load i32, i32* %137, align 4
177+
%139 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %136
178+
%140 = load i32, i32* %139, align 4
179+
%141 = add nsw i32 %138, %140
180+
%142 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %136
181+
store i32 %141, i32* %142, align 4
182+
%143 = add nsw i32 %135, 1
183+
%144 = sext i32 %143 to i64
184+
%145 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %144
185+
%146 = load i32, i32* %145, align 4
186+
%147 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %144
187+
%148 = load i32, i32* %147, align 4
188+
%149 = add nsw i32 %146, %148
189+
%150 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %144
190+
store i32 %149, i32* %150, align 4
191+
%151 = add nsw i32 %143, 1
192+
%152 = sext i32 %151 to i64
193+
%153 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %152
194+
%154 = load i32, i32* %153, align 4
195+
%155 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %152
196+
%156 = load i32, i32* %155, align 4
197+
%157 = add nsw i32 %154, %156
198+
%158 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %152
199+
store i32 %157, i32* %158, align 4
200+
%159 = add nsw i32 %151, 1
201+
%160 = sext i32 %159 to i64
202+
%161 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %160
203+
%162 = load i32, i32* %161, align 4
204+
%163 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %160
205+
%164 = load i32, i32* %163, align 4
206+
%165 = add nsw i32 %162, %164
207+
%166 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %160
208+
store i32 %165, i32* %166, align 4
209+
%167 = add nsw i32 %159, 1
210+
%168 = sext i32 %167 to i64
211+
%169 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %168
212+
%170 = load i32, i32* %169, align 4
213+
%171 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %168
214+
%172 = load i32, i32* %171, align 4
215+
%173 = add nsw i32 %170, %172
216+
%174 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %168
217+
store i32 %173, i32* %174, align 4
218+
%175 = add nsw i32 %167, 1
219+
%176 = sext i32 %175 to i64
220+
%177 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %176
221+
%178 = load i32, i32* %177, align 4
222+
%179 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %176
223+
%180 = load i32, i32* %179, align 4
224+
%181 = add nsw i32 %178, %180
225+
%182 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %176
226+
store i32 %181, i32* %182, align 4
227+
%183 = add nsw i32 %175, 1
228+
%184 = sext i32 %183 to i64
229+
%185 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %184
230+
%186 = load i32, i32* %185, align 4
231+
%187 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %184
232+
%188 = load i32, i32* %187, align 4
233+
%189 = add nsw i32 %186, %188
234+
%190 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %184
235+
store i32 %189, i32* %190, align 4
236+
%191 = add nsw i32 %183, 1
237+
%192 = sext i32 %191 to i64
238+
%193 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %192
239+
%194 = load i32, i32* %193, align 4
240+
%195 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %192
241+
%196 = load i32, i32* %195, align 4
242+
%197 = add nsw i32 %194, %196
243+
%198 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %192
244+
store i32 %197, i32* %198, align 4
245+
%199 = add nsw i32 %191, 1
246+
%200 = sext i32 %199 to i64
247+
%201 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %200
248+
%202 = load i32, i32* %201, align 4
249+
%203 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %200
250+
%204 = load i32, i32* %203, align 4
251+
%205 = add nsw i32 %202, %204
252+
%206 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %200
253+
store i32 %205, i32* %206, align 4
254+
%207 = add nsw i32 %199, 1
255+
%208 = add nsw i32 %207, 24
256+
br label %6
257+
}

0 commit comments

Comments
 (0)