Skip to content

[LAA] Use MaxStride instead of CommonStride to calculate MaxVF #98142

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2148,10 +2148,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
"different type sizes\n");
return Dependence::Unknown;
}

if (!CommonStride)
return Dependence::Unknown;

// Bail out early if passed-in parameters make vectorization not feasible.
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
VectorizerParams::VectorizationFactor : 1);
Expand All @@ -2162,7 +2158,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,

// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration in
// front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
// front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
// (No need to plus the last gap distance).
//
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
Expand All @@ -2186,11 +2182,14 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
// the minimum distance needed is 28, which is greater than distance. It is
// not safe to do vectorization.
//
// We use MaxStride (maximum of src and sink strides) to get a conservative
// lower bound on the MinDistanceNeeded in case of different strides.

// We know that Dist is positive, but it may not be constant. Use the signed
// minimum for computations below, as this ensures we compute the closest
// possible dependence distance.
uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
if (!ConstDist) {
// For non-constant distances, we checked the lower bound of the
Expand Down Expand Up @@ -2236,7 +2235,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
couldPreventStoreLoadForward(MinDistance, TypeByteSize, *CommonStride))
return Dependence::BackwardVectorizableButPreventsForwarding;

uint64_t MaxVF = MinDepDistBytes / *CommonStride;
uint64_t MaxVF = MinDepDistBytes / MaxStride;
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
<< " with max VF = " << MaxVF << '\n');

Expand Down
156 changes: 156 additions & 0 deletions llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes="print<access-info>" %s 2>&1 | FileCheck %s

@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16

; Generated from the following C code:
; #define LEN 256 * 256
; float a[LEN];
;
; void different_strides() {
; for (int i = 0; i < LEN - 1024 - 255; i++) {
; #pragma clang loop interleave(disable)
; #pragma clang loop unroll(disable)
; for (int j = 0; j < 256; j++)
; a[i + j + 1024] += a[j * 4 + i];
; }
; }
; The load and store have different strides (4 and 16 bytes respectively), but the store
; is always at a safe positive distance away from the load, thus BackwardVectorizable.
define void @different_strides_backward_vectorizable() {
; CHECK-LABEL: 'different_strides_backward_vectorizable'
; CHECK-NEXT: inner.body:
; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits
; CHECK-NEXT: Dependences:
; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
; CHECK-EMPTY:
; CHECK-NEXT: Forward:
; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
; CHECK-NEXT: outer.header:
; CHECK-NEXT: Report: loop is not the innermost loop
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
;
entry:
br label %outer.header

; Outer loop header: %0 = i + 1024 is the store's base index; it is
; invariant within the inner loop, so the load/store gap is a constant
; 1024 floats (4096 bytes) for each inner-loop execution.
outer.header:
%i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
%0 = add nuw nsw i64 %i, 1024
br label %inner.body

; Inner loop (the one LAA analyzes): the load reads a[4*j + i]
; (%1 = j << 2, so the load strides 4 floats = 16 bytes per j-step),
; while the store writes a[i + j + 1024] (stride 1 float = 4 bytes per
; j-step). Per the CHECK lines above, the backward dependence between
; them is still vectorizable (max safe vector width 2048 bits), and the
; load/store pair on %arrayidx8 is a same-address Forward dependence.
inner.body:
%j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
%1 = shl nuw nsw i64 %j, 2
%2 = add nuw nsw i64 %1, %i
%arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
%3 = load float, ptr %arrayidx, align 4
%4 = add nuw nsw i64 %0, %j
%arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
%5 = load float, ptr %arrayidx8, align 4
%add9 = fadd fast float %5, %3
store float %add9, ptr %arrayidx8, align 4
%j.next = add nuw nsw i64 %j, 1
%exitcond.not = icmp eq i64 %j.next, 256
br i1 %exitcond.not, label %outer.latch, label %inner.body

; Outer latch: i runs 0..64256 (64257 = 65536 - 1024 - 255 iterations,
; matching the C source's LEN - 1024 - 255 bound), keeping all accesses
; inside the 65536-element array @a.
outer.latch:
%i.next = add nuw nsw i64 %i, 1
%outerexitcond.not = icmp eq i64 %i.next, 64257
br i1 %outerexitcond.not, label %exit, label %outer.header

exit:
ret void
}


; Generated from the following C code:
; void different_stride_and_not_vectorizable(){
; for(int i = 0; i < LEN2; i++){
; for(int j = 0 ; j < LEN; j++){
; a[i + j + LEN] += a[i + 4*j];
; }
; }
; }
; The load and store have different strides, but the store and load are not at a
; safe distance away from each other, thus not safe for vectorization.
define void @different_stride_and_not_vectorizable() {
; CHECK-LABEL: 'different_stride_and_not_vectorizable'
; CHECK-NEXT: inner.body:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
; CHECK-NEXT: Unknown data dependence.
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
; CHECK-EMPTY:
; CHECK-NEXT: Forward:
; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
; CHECK-NEXT: outer.header:
; CHECK-NEXT: Report: loop is not the innermost loop
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
;
entry:
br label %outer.header

; Outer loop header: %0 = i + 256, the store's base index. Unlike the
; previous test, the load/store gap is only 256 floats (1024 bytes).
outer.header:
%i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
%0 = add nuw nsw i64 %i, 256
br label %inner.body

; Inner loop: same access shape as above — load of a[4*j + i] striding
; 16 bytes per j-step, store to a[i + j + 256] striding 4 bytes per
; j-step — but the smaller 1024-byte gap is not a safe distance, so per
; the CHECK lines the cross-address dependence is reported as Unknown
; (unsafe for vectorization). The load/store on %arrayidx8 remains a
; same-address Forward dependence.
inner.body:
%j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
%1 = shl nuw nsw i64 %j, 2
%2 = add nuw nsw i64 %1, %i
%arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
%3 = load float, ptr %arrayidx, align 4
%4 = add nuw nsw i64 %0, %j
%arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
%5 = load float, ptr %arrayidx8, align 4
%add9 = fadd fast float %5, %3
store float %add9, ptr %arrayidx8, align 4
%j.next = add nuw nsw i64 %j, 1
%exitcond.not = icmp eq i64 %j.next, 256
br i1 %exitcond.not, label %outer.latch, label %inner.body

; Outer latch: i runs the full 0..65535 range. NOTE(review): with
; i near 65535 the indices computed above exceed the 65536-element
; bounds of @a; presumably intentional for an analysis-only test —
; confirm against the generating C code.
outer.latch:
%i.next = add nuw nsw i64 %i, 1
%exitcond29.not = icmp eq i64 %i.next, 65536
br i1 %exitcond29.not, label %exit, label %outer.header

exit:
ret void
}
Loading