Commit bfedf64
[LoopVectorize] Add support for reverse loops in isDereferenceableAndAlignedInLoop (#96752)
Currently, when we encounter a negative step in the induction variable, isDereferenceableAndAlignedInLoop bails out because the element size compares signed-greater-than the step. This patch adds support for negative steps in cases where we detect that the start address for the load is of the form base + offset. In this case the address decrements on each iteration, so we need to calculate the access size differently. I have done this by calling getStartAndEndForAccess from LoopAccessAnalysis.cpp.

The motivation for this patch comes from PR #88385, where a reviewer requested reusing isDereferenceableAndAlignedInLoop, but that PR itself supports reverse loops. The changed test in LoopVectorize/X86/load-deref-pred.ll now passes because we previously calculated the total access size incorrectly, whereas now it is 412 bytes and fits exactly into the alloca.
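As a rough illustration of the target pattern (hypothetical code, not taken from the patch), a loop of the following shape produces the decrementing access this change handles: the pointer SCEV is an affine add-recurrence with a negative constant step, whose start address is base plus a positive offset.

```cpp
#include <cstddef>

// Hypothetical reverse loop: the load address starts at base + (n - 1)
// elements and steps down by one element per iteration, so the SCEV step
// is negative and the lowest address is only reached on the last iteration.
int sum_reversed(const int *base, size_t n) {
  int sum = 0;
  for (size_t i = n; i > 0; --i)
    sum += base[i - 1]; // touches [base, base + n*sizeof(int)) overall
  return sum;
}
```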
Parent: f9350c9 · Commit: bfedf64

File tree: 5 files changed (+188, -266 lines)

llvm/include/llvm/Analysis/LoopAccessAnalysis.h
19 additions & 0 deletions

```diff
@@ -853,6 +853,25 @@ bool sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, const DataLayout &DL,
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                          ScalarEvolution &SE, bool CheckType = true);
 
+/// Calculate Start and End points of memory access.
+/// Let's assume A is the first access and B is a memory access on N-th loop
+/// iteration. Then B is calculated as:
+///   B = A + Step*N.
+/// Step value may be positive or negative.
+/// N is a calculated back-edge taken count:
+///   N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
+/// Start and End points are calculated in the following way:
+///   Start = UMIN(A, B); End = UMAX(A, B) + SizeOfElt,
+/// where SizeOfElt is the size of single memory access in bytes.
+///
+/// There is no conflict when the intervals are disjoint:
+///   NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
+std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
+    DenseMap<std::pair<const SCEV *, Type *>,
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds);
+
 class LoopAccessInfoManager {
   /// The cache.
   DenseMap<Loop *, std::unique_ptr<LoopAccessInfo>> LoopAccessInfoMap;
```
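As a worked example of the formula in this doc comment (numbers invented; plain integers stand in for SCEV expressions), a step of -4 over a back-edge taken count of 9, starting at address 1000, yields the interval [964, 1004):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Invented values mirroring the doc comment: A is the first access,
  // Step is negative, N is the back-edge taken count.
  int64_t A = 1000, Step = -4, N = 9, SizeOfElt = 4;

  int64_t B = A + Step * N;                 // B = A + Step*N        -> 964
  int64_t Start = std::min(A, B);           // UMIN(A, B)            -> 964
  int64_t End = std::max(A, B) + SizeOfElt; // UMAX(A, B) + SizeOfElt -> 1004
  std::printf("access interval: [%lld, %lld)\n", (long long)Start,
              (long long)End);              // 40 bytes: 10 loads of 4 bytes
}
```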

llvm/lib/Analysis/Loads.cpp
57 additions & 52 deletions

```diff
@@ -13,6 +13,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -275,84 +276,88 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
 bool llvm::isDereferenceableAndAlignedInLoop(
     LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     AssumptionCache *AC, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+  const Align Alignment = LI->getAlign();
   auto &DL = LI->getDataLayout();
   Value *Ptr = LI->getPointerOperand();
-
   APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
                 DL.getTypeStoreSize(LI->getType()).getFixedValue());
-  const Align Alignment = LI->getAlign();
-
-  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
 
   // If given a uniform (i.e. non-varying) address, see if we can prove the
   // access is safe within the loop w/o needing predication.
   if (L->isLoopInvariant(Ptr))
-    return isDereferenceableAndAlignedPointer(Ptr, Alignment, EltSize, DL,
-                                              HeaderFirstNonPHI, AC, &DT);
+    return isDereferenceableAndAlignedPointer(
+        Ptr, Alignment, EltSize, DL, L->getHeader()->getFirstNonPHI(), AC, &DT);
+
+  const SCEV *PtrScev = SE.getSCEV(Ptr);
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(PtrScev);
 
-  // Otherwise, check to see if we have a repeating access pattern where we can
-  // prove that all accesses are well aligned and dereferenceable.
-  auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
+  // Check to see if we have a repeating access pattern and it's possible
+  // to prove all accesses are well aligned.
   if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
     return false;
+
   auto* Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
   if (!Step)
     return false;
 
-  auto TC = SE.getSmallConstantMaxTripCount(L, Predicates);
-  if (!TC)
+  // For the moment, restrict ourselves to the case where the access size is a
+  // multiple of the requested alignment and the base is aligned.
+  // TODO: generalize if a case found which warrants
+  if (EltSize.urem(Alignment.value()) != 0)
     return false;
 
   // TODO: Handle overlapping accesses.
-  // We should be computing AccessSize as (TC - 1) * Step + EltSize.
-  if (EltSize.sgt(Step->getAPInt()))
+  if (EltSize.ugt(Step->getAPInt().abs()))
+    return false;
+
+  const SCEV *MaxBECount =
+      SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates);
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    return false;
+
+  const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
+      L, PtrScev, LI->getType(), MaxBECount, &SE, nullptr);
+  if (isa<SCEVCouldNotCompute>(AccessStart) ||
+      isa<SCEVCouldNotCompute>(AccessEnd))
     return false;
 
-  // Compute the total access size for access patterns with unit stride and
-  // patterns with gaps. For patterns with unit stride, Step and EltSize are the
-  // same.
-  // For patterns with gaps (i.e. non unit stride), we are
-  // accessing EltSize bytes at every Step.
-  APInt AccessSize = TC * Step->getAPInt();
+  // Try to get the access size.
+  const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
+  APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff);
 
-  assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
-         "implied by addrec definition");
   Value *Base = nullptr;
-  if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
-    Base = StartS->getValue();
-  } else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
-    // Handle (NewBase + offset) as start value.
-    const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
-    const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
-    if (StartS->getNumOperands() == 2 && Offset && NewBase) {
-      // The following code below assumes the offset is unsigned, but GEP
-      // offsets are treated as signed so we can end up with a signed value
-      // here too. For example, suppose the initial PHI value is (i8 255),
-      // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
-      if (Offset->getAPInt().isNegative())
-        return false;
+  APInt AccessSize;
+  if (const SCEVUnknown *NewBase = dyn_cast<SCEVUnknown>(AccessStart)) {
+    Base = NewBase->getValue();
+    AccessSize = MaxPtrDiff;
+  } else if (auto *MinAdd = dyn_cast<SCEVAddExpr>(AccessStart)) {
+    if (MinAdd->getNumOperands() != 2)
+      return false;
 
-      // For the moment, restrict ourselves to the case where the offset is a
-      // multiple of the requested alignment and the base is aligned.
-      // TODO: generalize if a case found which warrants
-      if (Offset->getAPInt().urem(Alignment.value()) != 0)
-        return false;
-      Base = NewBase->getValue();
-      bool Overflow = false;
-      AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
-      if (Overflow)
-        return false;
-    }
-  }
+    const auto *Offset = dyn_cast<SCEVConstant>(MinAdd->getOperand(0));
+    const auto *NewBase = dyn_cast<SCEVUnknown>(MinAdd->getOperand(1));
+    if (!Offset || !NewBase)
+      return false;
 
-  if (!Base)
-    return false;
+    // The following code below assumes the offset is unsigned, but GEP
+    // offsets are treated as signed so we can end up with a signed value
+    // here too. For example, suppose the initial PHI value is (i8 255),
+    // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
+    if (Offset->getAPInt().isNegative())
+      return false;
 
-  // For the moment, restrict ourselves to the case where the access size is a
-  // multiple of the requested alignment and the base is aligned.
-  // TODO: generalize if a case found which warrants
-  if (EltSize.urem(Alignment.value()) != 0)
+    // For the moment, restrict ourselves to the case where the offset is a
+    // multiple of the requested alignment and the base is aligned.
+    // TODO: generalize if a case found which warrants
+    if (Offset->getAPInt().urem(Alignment.value()) != 0)
+      return false;
+
+    AccessSize = MaxPtrDiff + Offset->getAPInt();
+    Base = NewBase->getValue();
+  } else
     return false;
+
+  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
   return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
                                             HeaderFirstNonPHI, AC, &DT);
 }
```
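To see how the rewritten body arrives at an access size for a reverse loop, here is a minimal arithmetic sketch with invented numbers (start address base + 40, step -4, back-edge taken count 9, 4-byte loads). It is not the implementation, just the same calculation on concrete values:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Invented values: the load starts at (base + 40) and walks backwards.
  int64_t StartOffset = 40; // offset of the first access from base
  int64_t Step = -4;        // negative step: 4 bytes down per iteration
  int64_t BECount = 9;      // back-edge taken count
  int64_t EltSize = 4;      // bytes loaded each iteration

  // getStartAndEndForAccess: with a negative step, the *last* iteration
  // touches the lowest address, so AccessStart ends up as (base + 4).
  int64_t LowOffset = StartOffset + Step * BECount; // 4
  int64_t HighOffset = StartOffset + EltSize;       // 44
  int64_t MaxPtrDiff = HighOffset - LowOffset;      // AccessEnd - AccessStart = 40

  // AccessStart is an add of (constant 4 + base), so the constant is folded
  // back in: checking 44 bytes from base covers [base+4, base+44).
  int64_t AccessSize = MaxPtrDiff + LowOffset;      // 44
  std::printf("need %lld dereferenceable bytes from base\n",
              (long long)AccessSize);
}
```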

llvm/lib/Analysis/LoopAccessAnalysis.cpp
26 additions & 35 deletions

```diff
@@ -190,42 +190,29 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
   Members.push_back(Index);
 }
 
-/// Calculate Start and End points of memory access.
-/// Let's assume A is the first access and B is a memory access on N-th loop
-/// iteration. Then B is calculated as:
-///   B = A + Step*N.
-/// Step value may be positive or negative.
-/// N is a calculated back-edge taken count:
-///   N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
-/// Start and End points are calculated in the following way:
-///   Start = UMIN(A, B); End = UMAX(A, B) + SizeOfElt,
-/// where SizeOfElt is the size of single memory access in bytes.
-///
-/// There is no conflict when the intervals are disjoint:
-///   NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
-static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
-    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
-    PredicatedScalarEvolution &PSE,
+std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
-             std::pair<const SCEV *, const SCEV *>> &PointerBounds) {
-  ScalarEvolution *SE = PSE.getSE();
-
-  auto [Iter, Ins] = PointerBounds.insert(
-      {{PtrExpr, AccessTy},
-       {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
-  if (!Ins)
-    return Iter->second;
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
+  std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
+  if (PointerBounds) {
+    auto [Iter, Ins] = PointerBounds->insert(
+        {{PtrExpr, AccessTy},
+         {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
+    if (!Ins)
+      return Iter->second;
+    PtrBoundsPair = &Iter->second;
+  }
 
   const SCEV *ScStart;
   const SCEV *ScEnd;
 
   if (SE->isLoopInvariant(PtrExpr, Lp)) {
     ScStart = ScEnd = PtrExpr;
   } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
-    const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();
-
     ScStart = AR->getStart();
-    ScEnd = AR->evaluateAtIteration(Ex, *SE);
+    ScEnd = AR->evaluateAtIteration(MaxBECount, *SE);
     const SCEV *Step = AR->getStepRecurrence(*SE);
 
     // For expressions with negative step, the upper bound is ScStart and the
@@ -244,16 +231,18 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
     return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};
 
   assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
-  assert(SE->isLoopInvariant(ScEnd, Lp)&& "ScEnd needs to be invariant");
+  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
 
   // Add the size of the pointed element to ScEnd.
   auto &DL = Lp->getHeader()->getDataLayout();
   Type *IdxTy = DL.getIndexType(PtrExpr->getType());
   const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
   ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
 
-  Iter->second = {ScStart, ScEnd};
-  return Iter->second;
+  std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
+  if (PointerBounds)
+    *PtrBoundsPair = Res;
+  return Res;
 }
 
 /// Calculate Start and End points of memory access using
@@ -263,8 +252,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                     unsigned DepSetId, unsigned ASId,
                                     PredicatedScalarEvolution &PSE,
                                     bool NeedsFreeze) {
+  const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
   const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
-      Lp, PtrExpr, AccessTy, PSE, DC.getPointerBounds());
+      Lp, PtrExpr, AccessTy, MaxBECount, PSE.getSE(), &DC.getPointerBounds());
   assert(!isa<SCEVCouldNotCompute>(ScStart) &&
          !isa<SCEVCouldNotCompute>(ScEnd) &&
          "must be able to compute both start and end expressions");
@@ -1938,10 +1928,11 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
   // required for correctness.
   if (SE.isLoopInvariant(Src, InnermostLoop) ||
       SE.isLoopInvariant(Sink, InnermostLoop)) {
-    const auto &[SrcStart_, SrcEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds);
-    const auto &[SinkStart_, SinkEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds);
+    const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
+    const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Src, ATy, MaxBECount, PSE.getSE(), &PointerBounds);
+    const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Sink, BTy, MaxBECount, PSE.getSE(), &PointerBounds);
     if (!isa<SCEVCouldNotCompute>(SrcStart_) &&
         !isa<SCEVCouldNotCompute>(SrcEnd_) &&
         !isa<SCEVCouldNotCompute>(SinkStart_) &&
```
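Since getStartAndEndForAccess is now exported with an optional cache, a caller without a PredicatedScalarEvolution can pass a plain ScalarEvolution and a null PointerBounds map. A sketch of that call shape (not standalone; it assumes L, PtrScev, LI, MaxBECount, and SE are in scope, as in the Loads.cpp change above):

```cpp
// Passing nullptr for PointerBounds skips the memoization cache that the
// LoopAccessAnalysis callers share, which suits a one-off query like this.
const auto &[AccessStart, AccessEnd] = llvm::getStartAndEndForAccess(
    L, PtrScev, LI->getType(), MaxBECount, &SE, /*PointerBounds=*/nullptr);
if (isa<SCEVCouldNotCompute>(AccessStart) ||
    isa<SCEVCouldNotCompute>(AccessEnd))
  return false; // bounds could not be computed
```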
