Skip to content

Commit 384a5b0

Browse files
mrdaybird and fhahn authored
[LAA] Use MaxStride instead of CommonStride to calculate MaxVF (#98142)
We bail out from MaxVF calculation if the strides are not the same. Instead, we are dependent on runtime checks, though not yet implemented. We could instead use the MaxStride to conservatively use an upper bound.

This handles cases like the following:

```c
#define LEN 256 * 256
float a[LEN];

void gather() {
  for (int i = 0; i < LEN - 1024 - 255; i++) {
    #pragma clang loop interleave(disable)
    #pragma clang loop unroll(disable)
    for (int j = 0; j < 256; j++)
      a[i + j + 1024] += a[j * 4 + i];
  }
}
```

---------

Co-authored-by: Florian Hahn <[email protected]>
1 parent 1a7cd92 commit 384a5b0

File tree

2 files changed

+162
-7
lines changed

2 files changed

+162
-7
lines changed

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2148,10 +2148,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21482148
"different type sizes\n");
21492149
return Dependence::Unknown;
21502150
}
2151-
2152-
if (!CommonStride)
2153-
return Dependence::Unknown;
2154-
21552151
// Bail out early if passed-in parameters make vectorization not feasible.
21562152
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
21572153
VectorizerParams::VectorizationFactor : 1);
@@ -2162,7 +2158,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21622158

21632159
// It's not vectorizable if the distance is smaller than the minimum distance
21642160
// needed for a vectorized/unrolled version. Vectorizing one iteration in
2165-
// front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
2161+
// front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
21662162
// (No need to add the last gap distance).
21672163
//
21682164
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
@@ -2186,11 +2182,14 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21862182
// If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
21872183
// the minimum distance needed is 28, which is greater than distance. It is
21882184
// not safe to do vectorization.
2185+
//
2186+
// We use MaxStride (maximum of src and sink strides) to get a conservative
2187+
// lower bound on the MinDistanceNeeded in case of different strides.
21892188

21902189
// We know that Dist is positive, but it may not be constant. Use the signed
21912190
// minimum for computations below, as this ensures we compute the closest
21922191
// possible dependence distance.
2193-
uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
2192+
uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
21942193
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
21952194
if (!ConstDist) {
21962195
// For non-constant distances, we checked the lower bound of the
@@ -2236,7 +2235,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
22362235
couldPreventStoreLoadForward(MinDistance, TypeByteSize, *CommonStride))
22372236
return Dependence::BackwardVectorizableButPreventsForwarding;
22382237

2239-
uint64_t MaxVF = MinDepDistBytes / *CommonStride;
2238+
uint64_t MaxVF = MinDepDistBytes / MaxStride;
22402239
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
22412240
<< " with max VF = " << MaxVF << '\n');
22422241

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -passes="print<access-info>" %s 2>&1 | FileCheck %s
3+
4+
@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
5+
6+
; Generated from the following C code:
7+
; #define LEN 256 * 256
8+
; float a[LEN];
9+
;
10+
; void different_strides() {
11+
; for (int i = 0; i < LEN - 1024 - 255; i++) {
12+
; #pragma clang loop interleave(disable)
13+
; #pragma clang loop unroll(disable)
14+
; for (int j = 0; j < 256; j++)
15+
; a[i + j + 1024] += a[j * 4 + i];
16+
; }
17+
; }
18+
; The load and store have different strides (16 and 4 bytes respectively) but the store
19+
; is always at a safe positive distance away from the load, thus BackwardVectorizable
20+
define void @different_strides_backward_vectorizable() {
21+
; CHECK-LABEL: 'different_strides_backward_vectorizable'
22+
; CHECK-NEXT: inner.body:
23+
; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits
24+
; CHECK-NEXT: Dependences:
25+
; CHECK-NEXT: BackwardVectorizable:
26+
; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
27+
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
28+
; CHECK-EMPTY:
29+
; CHECK-NEXT: Forward:
30+
; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
31+
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
32+
; CHECK-EMPTY:
33+
; CHECK-NEXT: Run-time memory checks:
34+
; CHECK-NEXT: Grouped accesses:
35+
; CHECK-EMPTY:
36+
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
37+
; CHECK-NEXT: SCEV assumptions:
38+
; CHECK-EMPTY:
39+
; CHECK-NEXT: Expressions re-written:
40+
; CHECK-NEXT: outer.header:
41+
; CHECK-NEXT: Report: loop is not the innermost loop
42+
; CHECK-NEXT: Dependences:
43+
; CHECK-NEXT: Run-time memory checks:
44+
; CHECK-NEXT: Grouped accesses:
45+
; CHECK-EMPTY:
46+
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
47+
; CHECK-NEXT: SCEV assumptions:
48+
; CHECK-EMPTY:
49+
; CHECK-NEXT: Expressions re-written:
50+
;
51+
entry:
52+
br label %outer.header
53+
54+
outer.header:
55+
%i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
56+
%0 = add nuw nsw i64 %i, 1024
57+
br label %inner.body
58+
59+
inner.body:
60+
%j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
61+
%1 = shl nuw nsw i64 %j, 2
62+
%2 = add nuw nsw i64 %1, %i
63+
%arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
64+
%3 = load float, ptr %arrayidx, align 4
65+
%4 = add nuw nsw i64 %0, %j
66+
%arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
67+
%5 = load float, ptr %arrayidx8, align 4
68+
%add9 = fadd fast float %5, %3
69+
store float %add9, ptr %arrayidx8, align 4
70+
%j.next = add nuw nsw i64 %j, 1
71+
%exitcond.not = icmp eq i64 %j.next, 256
72+
br i1 %exitcond.not, label %outer.latch, label %inner.body
73+
74+
outer.latch:
75+
%i.next = add nuw nsw i64 %i, 1
76+
%outerexitcond.not = icmp eq i64 %i.next, 64257
77+
br i1 %outerexitcond.not, label %exit, label %outer.header
78+
79+
exit:
80+
ret void
81+
}
82+
83+
84+
; Generated from the following C code:
85+
; void different_stride_and_not_vectorizable(){
86+
; for(int i = 0; i < LEN2; i++){
87+
; for(int j = 0 ; j < LEN; j++){
88+
; a[i + j + LEN] += a[i + 4*j];
89+
; }
90+
; }
91+
; }
92+
; The load and store have different strides, but the store and load are not at a
93+
; safe distance away from each other, thus not safe for vectorization.
94+
define void @different_stride_and_not_vectorizable() {
95+
; CHECK-LABEL: 'different_stride_and_not_vectorizable'
96+
; CHECK-NEXT: inner.body:
97+
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
98+
; CHECK-NEXT: Unknown data dependence.
99+
; CHECK-NEXT: Dependences:
100+
; CHECK-NEXT: Unknown:
101+
; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
102+
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
103+
; CHECK-EMPTY:
104+
; CHECK-NEXT: Forward:
105+
; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
106+
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
107+
; CHECK-EMPTY:
108+
; CHECK-NEXT: Run-time memory checks:
109+
; CHECK-NEXT: Grouped accesses:
110+
; CHECK-EMPTY:
111+
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
112+
; CHECK-NEXT: SCEV assumptions:
113+
; CHECK-EMPTY:
114+
; CHECK-NEXT: Expressions re-written:
115+
; CHECK-NEXT: outer.header:
116+
; CHECK-NEXT: Report: loop is not the innermost loop
117+
; CHECK-NEXT: Dependences:
118+
; CHECK-NEXT: Run-time memory checks:
119+
; CHECK-NEXT: Grouped accesses:
120+
; CHECK-EMPTY:
121+
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
122+
; CHECK-NEXT: SCEV assumptions:
123+
; CHECK-EMPTY:
124+
; CHECK-NEXT: Expressions re-written:
125+
;
126+
entry:
127+
br label %outer.header
128+
129+
outer.header:
130+
%i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
131+
%0 = add nuw nsw i64 %i, 256
132+
br label %inner.body
133+
134+
inner.body:
135+
%j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
136+
%1 = shl nuw nsw i64 %j, 2
137+
%2 = add nuw nsw i64 %1, %i
138+
%arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
139+
%3 = load float, ptr %arrayidx, align 4
140+
%4 = add nuw nsw i64 %0, %j
141+
%arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
142+
%5 = load float, ptr %arrayidx8, align 4
143+
%add9 = fadd fast float %5, %3
144+
store float %add9, ptr %arrayidx8, align 4
145+
%j.next = add nuw nsw i64 %j, 1
146+
%exitcond.not = icmp eq i64 %j.next, 256
147+
br i1 %exitcond.not, label %outer.latch, label %inner.body
148+
149+
outer.latch:
150+
%i.next = add nuw nsw i64 %i, 1
151+
%exitcond29.not = icmp eq i64 %i.next, 65536
152+
br i1 %exitcond29.not, label %exit, label %outer.header
153+
154+
exit:
155+
ret void
156+
}

0 commit comments

Comments
 (0)