Skip to content

Commit 6eba277

Browse files
authored
[LAA] Scale strides using type-size (NFC) (#124529)
Change getDependenceDistanceStrideAndSize to scale strides by TypeByteSize, scaling the returned CommonStride and MaxStride. Even though there is a seemingly-functional change of setting CommonStride when scaled strides are equal, it ends up being a non-functional change due to aggressive HasSameSize checking.
1 parent 9ffab56 commit 6eba277

File tree

2 files changed

+44
-43
lines changed

2 files changed

+44
-43
lines changed

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -367,15 +367,17 @@ class MemoryDepChecker {
367367
struct DepDistanceStrideAndSizeInfo {
368368
const SCEV *Dist;
369369

370-
/// Strides could either be scaled (in bytes, taking the size of the
371-
/// underlying type into account), or unscaled (in indexing units; unscaled
372-
/// stride = scaled stride / size of underlying type). Here, strides are
373-
/// unscaled.
370+
/// Strides here are scaled; i.e. in bytes, taking the size of the
371+
/// underlying type into account.
374372
uint64_t MaxStride;
375373
std::optional<uint64_t> CommonStride;
376374

377375
bool ShouldRetryWithRuntimeCheck;
376+
377+
/// TypeByteSize is either the common store size of both accesses, or 0 when
378+
/// store sizes mismatch.
378379
uint64_t TypeByteSize;
380+
379381
bool AIsWrite;
380382
bool BIsWrite;
381383

@@ -394,8 +396,9 @@ class MemoryDepChecker {
394396
/// there's no dependence or the analysis fails. Outlined to lambda to limit
395397
/// the scope of various temporary variables, like A/BPtr, StrideA/BPtr and
396398
/// others. Returns either the dependence result, if it could already be
397-
/// determined, or a struct containing (Distance, Stride, TypeSize, AIsWrite,
398-
/// BIsWrite).
399+
/// determined, or a DepDistanceStrideAndSizeInfo struct, noting that
400+
/// TypeByteSize could be 0 when store sizes mismatch, and this should be
401+
/// checked in the caller.
399402
std::variant<Dependence::DepType, DepDistanceStrideAndSizeInfo>
400403
getDependenceDistanceStrideAndSize(const MemAccessInfo &A, Instruction *AInst,
401404
const MemAccessInfo &B,

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 35 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1786,22 +1786,21 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
17861786
Status = S;
17871787
}
17881788

1789-
/// Given a dependence-distance \p Dist between two
1790-
/// memory accesses, that have strides in the same direction whose absolute
1791-
/// value of the maximum stride is given in \p MaxStride, and that have the same
1792-
/// type size \p TypeByteSize, in a loop whose maximum backedge taken count is
1793-
/// \p MaxBTC, check if it is possible to prove statically that the dependence
1789+
/// Given a dependence-distance \p Dist between two memory accesses, that have
1790+
/// strides in the same direction whose absolute value of the maximum stride is
1791+
/// given in \p MaxStride, in a loop whose maximum backedge taken count is \p
1792+
/// MaxBTC, check if it is possible to prove statically that the dependence
17941793
/// distance is larger than the range that the accesses will travel through the
17951794
/// execution of the loop. If so, return true; false otherwise. This is useful
17961795
/// for example in loops such as the following (PR31098):
1796+
///
17971797
/// for (i = 0; i < D; ++i) {
17981798
/// = out[i];
17991799
/// out[i+D] =
18001800
/// }
18011801
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
18021802
const SCEV &MaxBTC, const SCEV &Dist,
1803-
uint64_t MaxStride,
1804-
uint64_t TypeByteSize) {
1803+
uint64_t MaxStride) {
18051804

18061805
// If we can prove that
18071806
// (**) |Dist| > MaxBTC * Step
@@ -1820,8 +1819,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
18201819
// will be executed only if LoopCount >= VF, proving distance >= LoopCount
18211820
// also guarantees that distance >= VF.
18221821
//
1823-
const uint64_t ByteStride = MaxStride * TypeByteSize;
1824-
const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride);
1822+
const SCEV *Step = SE.getConstant(MaxBTC.getType(), MaxStride);
18251823
const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);
18261824

18271825
const SCEV *CastedDist = &Dist;
@@ -1851,8 +1849,8 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
18511849
}
18521850

18531851
/// Check the dependence for two accesses with the same stride \p Stride.
1854-
/// \p Distance is the positive distance and \p TypeByteSize is type size in
1855-
/// bytes.
1852+
/// \p Distance is the positive distance in bytes, and \p TypeByteSize is type
1853+
/// size in bytes.
18561854
///
18571855
/// \returns true if they are independent.
18581856
static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
@@ -1865,25 +1863,23 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
18651863
if (Distance % TypeByteSize)
18661864
return false;
18671865

1868-
uint64_t ScaledDist = Distance / TypeByteSize;
1869-
1870-
// No dependence if the scaled distance is not multiple of the stride.
1866+
// No dependence if the distance is not a multiple of the stride.
18711867
// E.g.
18721868
// for (i = 0; i < 1024 ; i += 4)
18731869
// A[i+2] = A[i] + 1;
18741870
//
1875-
// Two accesses in memory (scaled distance is 2, stride is 4):
1871+
// Two accesses in memory (distance is 2, stride is 4):
18761872
// | A[0] | | | | A[4] | | | |
18771873
// | | | A[2] | | | | A[6] | |
18781874
//
18791875
// E.g.
18801876
// for (i = 0; i < 1024 ; i += 3)
18811877
// A[i+4] = A[i] + 1;
18821878
//
1883-
// Two accesses in memory (scaled distance is 4, stride is 3):
1879+
// Two accesses in memory (distance is 4, stride is 3):
18841880
// | A[0] | | | A[3] | | | A[6] | | |
18851881
// | | | | | A[4] | | | A[7] | |
1886-
return ScaledDist % Stride;
1882+
return Distance % Stride;
18871883
}
18881884

18891885
std::variant<MemoryDepChecker::Dependence::DepType,
@@ -1992,25 +1988,28 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
19921988
return MemoryDepChecker::Dependence::Unknown;
19931989
}
19941990

1995-
uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
1996-
bool HasSameSize =
1997-
DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
1998-
if (!HasSameSize)
1999-
TypeByteSize = 0;
1991+
TypeSize AStoreSz = DL.getTypeStoreSize(ATy);
1992+
TypeSize BStoreSz = DL.getTypeStoreSize(BTy);
1993+
1994+
// If store sizes are not the same, set TypeByteSize to zero, so we can check
1995+
// it in the caller isDependent.
1996+
uint64_t ASz = DL.getTypeAllocSize(ATy);
1997+
uint64_t BSz = DL.getTypeAllocSize(BTy);
1998+
uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0;
20001999

2001-
StrideAPtrInt = std::abs(StrideAPtrInt);
2002-
StrideBPtrInt = std::abs(StrideBPtrInt);
2000+
uint64_t StrideAScaled = std::abs(StrideAPtrInt) * ASz;
2001+
uint64_t StrideBScaled = std::abs(StrideBPtrInt) * BSz;
20032002

2004-
uint64_t MaxStride = std::max(StrideAPtrInt, StrideBPtrInt);
2003+
uint64_t MaxStride = std::max(StrideAScaled, StrideBScaled);
20052004

20062005
std::optional<uint64_t> CommonStride;
2007-
if (StrideAPtrInt == StrideBPtrInt)
2008-
CommonStride = StrideAPtrInt;
2006+
if (StrideAScaled == StrideBScaled)
2007+
CommonStride = StrideAScaled;
20092008

20102009
// TODO: Historically, we don't retry with runtime checks unless the
20112010
// (unscaled) strides are the same. Fix this once the condition for runtime
20122011
// checks in isDependent is fixed.
2013-
bool ShouldRetryWithRuntimeCheck = CommonStride.has_value();
2012+
bool ShouldRetryWithRuntimeCheck = StrideAPtrInt == StrideBPtrInt;
20142013

20152014
return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
20162015
ShouldRetryWithRuntimeCheck, TypeByteSize,
@@ -2050,9 +2049,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20502049
// upper bound of the number of iterations), the accesses are independent, i.e.
20512050
// they are far enough apart that accesses won't access the same location
20522051
// across all loop iterations.
2053-
if (HasSameSize && isSafeDependenceDistance(
2054-
DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
2055-
*Dist, MaxStride, TypeByteSize))
2052+
if (HasSameSize &&
2053+
isSafeDependenceDistance(
2054+
DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist, MaxStride))
20562055
return Dependence::NoDep;
20572056

20582057
const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
@@ -2156,8 +2155,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21562155

21572156
// It's not vectorizable if the distance is smaller than the minimum distance
21582157
// needed for a vectorized/unrolled version. Vectorizing one iteration in
2159-
// front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2160-
// TypeByteSize (No need to plus the last gap distance).
2158+
// front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
2159+
// (No need to add the last gap distance).
21612160
//
21622161
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
21632162
// foo(int *A) {
@@ -2166,7 +2165,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21662165
// B[i] = A[i] + 1;
21672166
// }
21682167
//
2169-
// Two accesses in memory (stride is 2):
2168+
// Two accesses in memory (stride is 4 * 2):
21702169
// | A[0] | | A[2] | | A[4] | | A[6] | |
21712170
// | B[0] | | B[2] | | B[4] |
21722171
//
@@ -2184,8 +2183,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21842183
// We know that Dist is positive, but it may not be constant. Use the signed
21852184
// minimum for computations below, as this ensures we compute the closest
21862185
// possible dependence distance.
2187-
uint64_t MinDistanceNeeded =
2188-
TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
2186+
uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
21892187
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
21902188
if (!ConstDist) {
21912189
// For non-constant distances, we checked the lower bound of the
@@ -2241,7 +2239,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
22412239

22422240
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
22432241
// since there is a backwards dependency.
2244-
uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
2242+
uint64_t MaxVF = MinDepDistBytes / *CommonStride;
22452243
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
22462244
<< " with max VF = " << MaxVF << '\n');
22472245

0 commit comments

Comments
 (0)