Skip to content

Commit 28767af

Browse files
authored
[LAA] Support backward dependences with non-constant distance. (llvm#91525)
Following up to 933f492, also update the code reasoning about backwards dependences to support non-constant distances. Update the code to use the signed minimum distance instead of a constant distance. This means we checked the lower bound of the dependence distance and the distance may be larger at runtime (and safe for vectorization). Whether to classify it as Unknown or Backwards depends on the vector width, and LAA was updated to take TTI to get the maximum vector register width. If the minimum dependence distance is larger than the max vector width, we consider it as backwards-vectorizable. Otherwise we classify them as Unknown, so we re-try with runtime checks. PR: llvm#91525
1 parent 2e8d815 commit 28767af

File tree

7 files changed

+247
-169
lines changed

7 files changed

+247
-169
lines changed

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,10 @@ class MemoryDepChecker {
181181
const SmallVectorImpl<Instruction *> &Instrs) const;
182182
};
183183

184-
MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L)
185-
: PSE(PSE), InnermostLoop(L) {}
184+
MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L,
185+
unsigned MaxTargetVectorWidthInBits)
186+
: PSE(PSE), InnermostLoop(L),
187+
MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
186188

187189
/// Register the location (instructions are given increasing numbers)
188190
/// of a write access.
@@ -314,6 +316,12 @@ class MemoryDepChecker {
314316
/// RecordDependences is true.
315317
SmallVector<Dependence, 8> Dependences;
316318

319+
/// The maximum width of a target's vector registers multiplied by 2 to also
320+
/// roughly account for additional interleaving. Is used to decide if a
321+
/// backwards dependence with non-constant stride should be classified as
322+
/// backwards-vectorizable or unknown (triggering a runtime check).
323+
unsigned MaxTargetVectorWidthInBits = 0;
324+
317325
/// Check whether there is a plausible dependence between the two
318326
/// accesses.
319327
///
@@ -575,8 +583,9 @@ class RuntimePointerChecking {
575583
/// PSE must be emitted in order for the results of this analysis to be valid.
576584
class LoopAccessInfo {
577585
public:
578-
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI,
579-
AAResults *AA, DominatorTree *DT, LoopInfo *LI);
586+
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI,
587+
const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT,
588+
LoopInfo *LI);
580589

581590
/// Return true we can analyze the memory accesses in the loop and there are
582591
/// no memory dependence cycles. Note that for dependences between loads &
@@ -799,12 +808,14 @@ class LoopAccessInfoManager {
799808
AAResults &AA;
800809
DominatorTree &DT;
801810
LoopInfo &LI;
811+
TargetTransformInfo *TTI;
802812
const TargetLibraryInfo *TLI = nullptr;
803813

804814
public:
805815
LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT,
806-
LoopInfo &LI, const TargetLibraryInfo *TLI)
807-
: SE(SE), AA(AA), DT(DT), LI(LI), TLI(TLI) {}
816+
LoopInfo &LI, TargetTransformInfo *TTI,
817+
const TargetLibraryInfo *TLI)
818+
: SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI) {}
808819

809820
const LoopAccessInfo &getInfo(Loop &L);
810821

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 67 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "llvm/Analysis/ScalarEvolution.h"
3232
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
3333
#include "llvm/Analysis/TargetLibraryInfo.h"
34+
#include "llvm/Analysis/TargetTransformInfo.h"
3435
#include "llvm/Analysis/ValueTracking.h"
3536
#include "llvm/Analysis/VectorUtils.h"
3637
#include "llvm/IR/BasicBlock.h"
@@ -2122,32 +2123,34 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
21222123
return Dependence::Forward;
21232124
}
21242125

2125-
if (!C) {
2126-
// TODO: FoundNonConstantDistanceDependence is used as a necessary condition
2127-
// to consider retrying with runtime checks. Historically, we did not set it
2128-
// when strides were different but there is no inherent reason to.
2126+
int64_t MinDistance = SE.getSignedRangeMin(Dist).getSExtValue();
2127+
// Below we only handle strictly positive distances.
2128+
if (MinDistance <= 0) {
21292129
FoundNonConstantDistanceDependence |= CommonStride.has_value();
2130-
LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
21312130
return Dependence::Unknown;
21322131
}
21332132

2134-
if (!SE.isKnownPositive(Dist))
2135-
return Dependence::Unknown;
2133+
if (!isa<SCEVConstant>(Dist)) {
2134+
// Previously this case would be treated as Unknown, possibly setting
2135+
// FoundNonConstantDistanceDependence to force re-trying with runtime
2136+
// checks. Until the TODO below is addressed, set it here to preserve
2137+
// original behavior w.r.t. re-trying with runtime checks.
2138+
// TODO: FoundNonConstantDistanceDependence is used as a necessary
2139+
// condition to consider retrying with runtime checks. Historically, we
2140+
// did not set it when strides were different but there is no inherent
2141+
// reason to.
2142+
FoundNonConstantDistanceDependence |= CommonStride.has_value();
2143+
}
21362144

21372145
if (!HasSameSize) {
21382146
LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
21392147
"different type sizes\n");
21402148
return Dependence::Unknown;
21412149
}
21422150

2143-
// The logic below currently only supports StrideA == StrideB, i.e. there's a
2144-
// common stride.
21452151
if (!CommonStride)
21462152
return Dependence::Unknown;
21472153

2148-
const APInt &Val = C->getAPInt();
2149-
int64_t Distance = Val.getSExtValue();
2150-
21512154
// Bail out early if passed-in parameters make vectorization not feasible.
21522155
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
21532156
VectorizerParams::VectorizationFactor : 1);
@@ -2172,8 +2175,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
21722175
// | A[0] | | A[2] | | A[4] | | A[6] | |
21732176
// | B[0] | | B[2] | | B[4] |
21742177
//
2175-
// Distance needs for vectorizing iterations except the last iteration:
2176-
// 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4.
2178+
// MinDistance needs for vectorizing iterations except the last iteration:
2179+
// 4 * 2 * (MinNumIter - 1). MinDistance needs for the last iteration: 4.
21772180
// So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
21782181
//
21792182
// If MinNumIter is 2, it is vectorizable as the minimum distance needed is
@@ -2182,11 +2185,22 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
21822185
// If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
21832186
// the minimum distance needed is 28, which is greater than distance. It is
21842187
// not safe to do vectorization.
2188+
2189+
// We know that Dist is positive, but it may not be constant. Use the signed
2190+
// minimum for computations below, as this ensures we compute the closest
2191+
// possible dependence distance.
21852192
uint64_t MinDistanceNeeded =
2186-
TypeByteSize * (*CommonStride) * (MinNumIter - 1) + TypeByteSize;
2187-
if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
2188-
LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
2189-
<< Distance << '\n');
2193+
TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
2194+
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
2195+
if (!isa<SCEVConstant>(Dist)) {
2196+
// For non-constant distances, we checked the lower bound of the
2197+
// dependence distance and the distance may be larger at runtime (and safe
2198+
// for vectorization). Classify it as Unknown, so we re-try with runtime
2199+
// checks.
2200+
return Dependence::Unknown;
2201+
}
2202+
LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
2203+
<< MinDistance << '\n');
21902204
return Dependence::Backward;
21912205
}
21922206

@@ -2215,12 +2229,13 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
22152229
// is 8, which is less than 2 and forbidden vectorization, But actually
22162230
// both A and B could be vectorized by 2 iterations.
22172231
MinDepDistBytes =
2218-
std::min(static_cast<uint64_t>(Distance), MinDepDistBytes);
2232+
std::min(static_cast<uint64_t>(MinDistance), MinDepDistBytes);
22192233

22202234
bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
22212235
uint64_t MinDepDistBytesOld = MinDepDistBytes;
22222236
if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2223-
couldPreventStoreLoadForward(Distance, TypeByteSize)) {
2237+
isa<SCEVConstant>(Dist) &&
2238+
couldPreventStoreLoadForward(MinDistance, TypeByteSize)) {
22242239
// Sanity check that we didn't update MinDepDistBytes when calling
22252240
// couldPreventStoreLoadForward
22262241
assert(MinDepDistBytes == MinDepDistBytesOld &&
@@ -2232,10 +2247,18 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
22322247

22332248
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
22342249
// since there is a backwards dependency.
2235-
uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * (*CommonStride));
2236-
LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
2250+
uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
2251+
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
22372252
<< " with max VF = " << MaxVF << '\n');
2253+
22382254
uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2255+
if (!isa<SCEVConstant>(Dist) && MaxVFInBits < MaxTargetVectorWidthInBits) {
2256+
// For non-constant distances, we checked the lower bound of the dependence
2257+
// distance and the distance may be larger at runtime (and safe for
2258+
// vectorization). Classify it as Unknown, so we re-try with runtime checks.
2259+
return Dependence::Unknown;
2260+
}
2261+
22392262
MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
22402263
return Dependence::BackwardVectorizable;
22412264
}
@@ -3018,11 +3041,28 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
30183041
}
30193042

30203043
LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
3044+
const TargetTransformInfo *TTI,
30213045
const TargetLibraryInfo *TLI, AAResults *AA,
30223046
DominatorTree *DT, LoopInfo *LI)
30233047
: PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
3024-
PtrRtChecking(nullptr),
3025-
DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
3048+
PtrRtChecking(nullptr), TheLoop(L) {
3049+
unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
3050+
if (TTI) {
3051+
TypeSize FixedWidth =
3052+
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3053+
if (FixedWidth.isNonZero()) {
3054+
// Scale the vector width by 2 as rough estimate to also consider
3055+
// interleaving.
3056+
MaxTargetVectorWidthInBits = FixedWidth.getFixedValue() * 2;
3057+
}
3058+
3059+
TypeSize ScalableWidth =
3060+
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_ScalableVector);
3061+
if (ScalableWidth.isNonZero())
3062+
MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
3063+
}
3064+
DepChecker =
3065+
std::make_unique<MemoryDepChecker>(*PSE, L, MaxTargetVectorWidthInBits);
30263066
PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
30273067
if (canAnalyzeLoop()) {
30283068
analyzeLoop(AA, LI, TLI, DT);
@@ -3082,7 +3122,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
30823122

30833123
if (I.second)
30843124
I.first->second =
3085-
std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);
3125+
std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT, &LI);
30863126

30873127
return *I.first->second;
30883128
}
@@ -3111,8 +3151,9 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
31113151
auto &AA = FAM.getResult<AAManager>(F);
31123152
auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
31133153
auto &LI = FAM.getResult<LoopAnalysis>(F);
3154+
auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
31143155
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
3115-
return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
3156+
return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI);
31163157
}
31173158

31183159
AnalysisKey LoopAccessAnalysis::Key;

llvm/lib/Transforms/Scalar/LoopFlatten.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1005,7 +1005,7 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
10051005
// in simplified form, and also needs LCSSA. Running
10061006
// this pass will simplify all loops that contain inner loops,
10071007
// regardless of whether anything ends up being flattened.
1008-
LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, nullptr);
1008+
LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr);
10091009
for (Loop *InnerLoop : LN.getLoops()) {
10101010
auto *OuterLoop = InnerLoop->getParentLoop();
10111011
if (!OuterLoop)

llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
582582
const Function *F = L.getHeader()->getParent();
583583
OptimizationRemarkEmitter ORE(F);
584584

585-
LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr);
585+
LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr);
586586
if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT))
587587
return PreservedAnalyses::all();
588588
return getLoopPassPreservedAnalyses();

llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323

2424
; CHECK: function 'Test':
2525
; CHECK: .inner:
26-
; CHECK-NEXT: Memory dependences are safe with run-time checks
26+
; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits with run-time checks
2727
; CHECK-NEXT: Dependences:
2828
; CHECK-NEXT: Run-time memory checks:
2929
; CHECK: Check 0:

0 commit comments

Comments
 (0)