Skip to content

[LAA] Use PSE::getSymbolicMaxBackedgeTakenCount. #93499

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 33 additions & 30 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ getStartAndEndForAccess(const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
if (SE->isLoopInvariant(PtrExpr, Lp)) {
ScStart = ScEnd = PtrExpr;
} else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
const SCEV *Ex = PSE.getBackedgeTakenCount();
const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();

ScStart = AR->getStart();
ScEnd = AR->evaluateAtIteration(Ex, *SE);
Expand Down Expand Up @@ -1796,28 +1796,28 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
/// Given a dependence-distance \p Dist between two
/// memory accesses, that have strides in the same direction whose absolute
/// value of the maximum stride is given in \p MaxStride, and that have the same
/// type size \p TypeByteSize, in a loop whose takenCount is \p
/// BackedgeTakenCount, check if it is possible to prove statically that the
/// dependence distance is larger than the range that the accesses will travel
/// through the execution of the loop. If so, return true; false otherwise. This
/// is useful for example in loops such as the following (PR31098):
/// type size \p TypeByteSize, in a loop whose maximum backedge taken count is
/// \p MaxBTC, check if it is possible to prove statically that the dependence
/// distance is larger than the range that the accesses will travel through the
/// execution of the loop. If so, return true; false otherwise. This is useful
/// for example in loops such as the following (PR31098):
/// for (i = 0; i < D; ++i) {
/// = out[i];
/// out[i+D] =
/// }
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
const SCEV &BackedgeTakenCount,
const SCEV &Dist, uint64_t MaxStride,
const SCEV &MaxBTC, const SCEV &Dist,
uint64_t MaxStride,
uint64_t TypeByteSize) {

// If we can prove that
// (**) |Dist| > BackedgeTakenCount * Step
// (**) |Dist| > MaxBTC * Step
// where Step is the absolute stride of the memory accesses in bytes,
// then there is no dependence.
//
// Rationale:
// We basically want to check if the absolute distance (|Dist/Step|)
// is >= the loop iteration count (or > BackedgeTakenCount).
// is >= the loop iteration count (or > MaxBTC).
// This is equivalent to the Strong SIV Test (Practical Dependence Testing,
// Section 4.2.1); Note, that for vectorization it is sufficient to prove
// that the dependence distance is >= VF; This is checked elsewhere.
Expand All @@ -1828,8 +1828,8 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
// also guarantees that distance >= VF.
//
const uint64_t ByteStride = MaxStride * TypeByteSize;
const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride);
const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);

const SCEV *CastedDist = &Dist;
const SCEV *CastedProduct = Product;
Expand All @@ -1844,13 +1844,13 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
else
CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());

// Is Dist - (BackedgeTakenCount * Step) > 0 ?
// Is Dist - (MaxBTC * Step) > 0 ?
// (If so, then we have proven (**) because |Dist| >= Dist)
const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
if (SE.isKnownPositive(Minus))
return true;

// Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
// Second try: Is -Dist - (MaxBTC * Step) > 0 ?
// (If so, then we have proven (**) because |Dist| >= -1*Dist)
const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
Minus = SE.getMinusSCEV(NegDist, CastedProduct);
Expand Down Expand Up @@ -2034,12 +2034,13 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
uint64_t MaxStride = std::max(StrideA, StrideB);

// If the distance between the acecsses is larger than their maximum absolute
// stride multiplied by the backedge taken count, the accesses are independet,
// i.e. they are far enough appart that accesses won't access the same
// location across all loop ierations.
if (HasSameSize &&
isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
MaxStride, TypeByteSize))
// stride multiplied by the symbolic maximum backedge taken count (which is an
// upper bound of the number of iterations), the accesses are independent, i.e.
// they are far enough apart that accesses won't access the same location
// across all loop iterations.
if (HasSameSize && isSafeDependenceDistance(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doc comment on isSafeDependenceDistance needs updated. I think it's correct, but there's a difference between an exact BTC and a bound on BTC.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, thanks!

DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
*Dist, MaxStride, TypeByteSize))
return Dependence::NoDep;

const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
Expand Down Expand Up @@ -2374,8 +2375,10 @@ bool LoopAccessInfo::canAnalyzeLoop() {
return false;
}

// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = PSE->getBackedgeTakenCount();
// ScalarEvolution needs to be able to find the symbolic max backedge taken
// count, which is an upper bound on the number of loop iterations. The loop
// may execute fewer iterations, if it exits via an uncountable exit.
const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update the comments to say bound on the btc.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, thanks!

if (isa<SCEVCouldNotCompute>(ExitCount)) {
recordAnalysis("CantComputeNumberOfIterations")
<< "could not determine number of loop iterations";
Expand Down Expand Up @@ -2984,25 +2987,25 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
// of various possible stride specializations, considering the alternatives
// of using gather/scatters (if available).

const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();

// Match the types so we can compare the stride and the BETakenCount.
// Match the types so we can compare the stride and the MaxBTC.
// The Stride can be positive/negative, so we sign extend Stride;
// The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
// The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
uint64_t BETypeSizeBits = DL.getTypeSizeInBits(MaxBTC->getType());
const SCEV *CastedStride = StrideExpr;
const SCEV *CastedBECount = BETakenCount;
const SCEV *CastedBECount = MaxBTC;
ScalarEvolution *SE = PSE->getSE();
if (BETypeSizeBits >= StrideTypeSizeBits)
CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
CastedStride = SE->getNoopOrSignExtend(StrideExpr, MaxBTC->getType());
else
CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
CastedBECount = SE->getZeroExtendExpr(MaxBTC, StrideExpr->getType());
const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
// Since TripCount == BackEdgeTakenCount + 1, checking:
// "Stride >= TripCount" is equivalent to checking:
// Stride - BETakenCount > 0
// Stride - MaxBTC > 0
if (SE->isKnownPositive(StrideMinusBETaken)) {
LLVM_DEBUG(
dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2079,7 +2079,7 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
// FIXME: It is highly suspicious that we're ignoring the predicates here.
SmallVector<const SCEVPredicate *, 4> Pred;
const SCEV *ExitCount =
SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred);
SE.getPredicatedSymbolicMaxBackedgeTakenCount(AR->getLoop(), Pred);

assert(!isa<SCEVCouldNotCompute>(ExitCount) && "Invalid loop count");

Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1506,6 +1506,16 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
return false;
}

if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about the other users of LAA in tree? Have you audited them? If not, can you add bailouts to ensure we're not breaking anything with this transition?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked LoopVersioningLICM, LoopDistribute.cpp & LoopLoadElimination.cpp, they all check for a single exiting block, which should be sufficient I think. I also added tests for loop-distribute & loop-load-elim.

reportVectorizationFailure("could not determine number of loop iterations",
"could not determine number of loop iterations",
"CantComputeNumberOfIterations", ORE, TheLoop);
if (DoExtraAnalysis)
Result = false;
else
return false;
}

LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerChecking()->Need
? " (with a runtime bound check)"
Expand Down
39 changes: 36 additions & 3 deletions llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,21 @@
define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations(ptr %A, ptr %B) {
; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations'
; CHECK-NEXT: loop.header:
; CHECK-NEXT: Report: could not determine number of loop iterations
; CHECK-NEXT: Memory dependences are safe with run-time checks
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Check 0:
; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]):
; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]):
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group [[GRP1]]:
; CHECK-NEXT: (Low: %B High: (2000 + %B))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group [[GRP2]]:
; CHECK-NEXT: (Low: %A High: (2000 + %A))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
Expand Down Expand Up @@ -53,10 +64,21 @@ e.2:
define i32 @all_exits_dominate_latch_countable_exits_at_most_1000_iterations(ptr %A, ptr %B) {
; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_1000_iterations'
; CHECK-NEXT: loop.header:
; CHECK-NEXT: Report: could not determine number of loop iterations
; CHECK-NEXT: Memory dependences are safe with run-time checks
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Check 0:
; CHECK-NEXT: Comparing group ([[GRP3:0x[0-9a-f]+]]):
; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
; CHECK-NEXT: Against group ([[GRP4:0x[0-9a-f]+]]):
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group [[GRP3]]:
; CHECK-NEXT: (Low: %B High: (4004 + %B))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group [[GRP4]]:
; CHECK-NEXT: (Low: %A High: (4004 + %A))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
Expand Down Expand Up @@ -145,10 +167,21 @@ e.2:
define i32 @b3_does_not_dominate_latch(ptr %A, ptr %B) {
; CHECK-LABEL: 'b3_does_not_dominate_latch'
; CHECK-NEXT: loop.header:
; CHECK-NEXT: Report: could not determine number of loop iterations
; CHECK-NEXT: Memory dependences are safe with run-time checks
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Check 0:
; CHECK-NEXT: Comparing group ([[GRP5:0x[0-9a-f]+]]):
; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
; CHECK-NEXT: Against group ([[GRP6:0x[0-9a-f]+]]):
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group [[GRP5]]:
; CHECK-NEXT: (Low: %B High: (4004 + %B))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group [[GRP6]]:
; CHECK-NEXT: (Low: %A High: (4004 + %A))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
Expand Down
96 changes: 96 additions & 0 deletions llvm/test/Transforms/LoopDistribute/early-exit.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; REQUIRES: x86-registered-target
; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S %s | FileCheck %s

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"

@B = common global ptr null, align 8
@A = common global ptr null, align 8
@C = common global ptr null, align 8
@D = common global ptr null, align 8
@E = common global ptr null, align 8

define void @f() {
; CHECK-LABEL: define void @f() {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[A:%.*]] = load ptr, ptr @A, align 8
; CHECK-NEXT: [[B:%.*]] = load ptr, ptr @B, align 8
; CHECK-NEXT: [[C:%.*]] = load ptr, ptr @C, align 8
; CHECK-NEXT: [[D:%.*]] = load ptr, ptr @D, align 8
; CHECK-NEXT: [[E:%.*]] = load ptr, ptr @E, align 8
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[IND:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[LATCH:.*]] ]
; CHECK-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IND]]
; CHECK-NEXT: [[LOADA:%.*]] = load i32, ptr [[ARRAYIDXA]], align 4
; CHECK-NEXT: [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IND]]
; CHECK-NEXT: [[LOADB:%.*]] = load i32, ptr [[ARRAYIDXB]], align 4
; CHECK-NEXT: [[UNCOUNTABLE_C:%.*]] = icmp eq i32 [[LOADB]], 10
; CHECK-NEXT: br i1 [[UNCOUNTABLE_C]], label %[[FOR_END:.*]], label %[[LATCH]]
; CHECK: [[LATCH]]:
; CHECK-NEXT: [[MULA:%.*]] = mul i32 [[LOADB]], [[LOADA]]
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[IND]], 1
; CHECK-NEXT: [[ARRAYIDXA_PLUS_4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD]]
; CHECK-NEXT: store i32 [[MULA]], ptr [[ARRAYIDXA_PLUS_4]], align 4
; CHECK-NEXT: [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[IND]]
; CHECK-NEXT: [[LOADD:%.*]] = load i32, ptr [[ARRAYIDXD]], align 4
; CHECK-NEXT: [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[IND]]
; CHECK-NEXT: [[LOADE:%.*]] = load i32, ptr [[ARRAYIDXE]], align 4
; CHECK-NEXT: [[MULC:%.*]] = mul i32 [[LOADD]], [[LOADE]]
; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IND]]
; CHECK-NEXT: store i32 [[MULC]], ptr [[ARRAYIDXC]], align 4
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
; Loop with two exits: a countable latch exit (%add == 20) and a data-dependent
; early exit taken when the value loaded from %b equals 10. The CHECK lines
; above show the loop unmodified, i.e. loop-distribute does not transform it.
; NOTE(review): per the PR discussion this test was added to cover LAA's switch
; to the symbolic maximum backedge-taken count — confirm intent stays in sync
; with that change.
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
%c = load ptr, ptr @C, align 8
%d = load ptr, ptr @D, align 8
%e = load ptr, ptr @E, align 8
br label %for.body

for.body:
%ind = phi i64 [ 0, %entry ], [ %add, %latch ]

%arrayidxA = getelementptr inbounds i32, ptr %a, i64 %ind
%loadA = load i32, ptr %arrayidxA, align 4

%arrayidxB = getelementptr inbounds i32, ptr %b, i64 %ind
%loadB = load i32, ptr %arrayidxB, align 4
; Uncountable early exit: leave the loop as soon as the loaded value is 10.
%uncountable.c = icmp eq i32 %loadB, 10
br i1 %uncountable.c, label %for.end, label %latch

latch:
%mulA = mul i32 %loadB, %loadA

; Store to A[%ind + 1] while A[%ind] was read above (loop-carried distance 1).
%add = add nuw nsw i64 %ind, 1
%arrayidxA_plus_4 = getelementptr inbounds i32, ptr %a, i64 %add
store i32 %mulA, ptr %arrayidxA_plus_4, align 4

%arrayidxD = getelementptr inbounds i32, ptr %d, i64 %ind
%loadD = load i32, ptr %arrayidxD, align 4

%arrayidxE = getelementptr inbounds i32, ptr %e, i64 %ind
%loadE = load i32, ptr %arrayidxE, align 4

%mulC = mul i32 %loadD, %loadE

%arrayidxC = getelementptr inbounds i32, ptr %c, i64 %ind
store i32 %mulC, ptr %arrayidxC, align 4

; Countable exit: at most 20 iterations via the latch.
%exitcond = icmp eq i64 %add, 20
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}

attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind convergent }

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
61 changes: 61 additions & 0 deletions llvm/test/Transforms/LoopLoadElim/early-exit.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=loop-load-elim -S %s | FileCheck %s

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

define void @f(ptr %A, ptr %B, ptr %C, i64 %N) {
; CHECK-LABEL: define void @f(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[FOR_BODY_LVER_CHECK:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_LVER_CHECK]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[LATCH:.*]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[AIDX_NEXT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT: [[BIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[CIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[AIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[BIDX]], align 4
; CHECK-NEXT: [[UNCOUNTABLE_C:%.*]] = icmp eq i32 [[B]], 10
; CHECK-NEXT: br i1 [[UNCOUNTABLE_C]], label %[[LATCH]], label %[[FOR_END:.*]]
; CHECK: [[LATCH]]:
; CHECK-NEXT: [[A_P1:%.*]] = add i32 [[B]], 2
; CHECK-NEXT: store i32 [[A_P1]], ptr [[AIDX_NEXT]], align 4
; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[AIDX]], align 1
; CHECK-NEXT: [[C:%.*]] = mul i32 [[A]], 2
; CHECK-NEXT: store i32 [[C]], ptr [[CIDX]], align 4
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
; Loop with a countable latch exit (%indvars.iv.next == %N) plus a
; data-dependent early exit taken when the value loaded from %B differs
; from 10. The CHECK lines show the body unchanged, i.e. loop-load-elim
; performs no forwarding here. NOTE(review): added alongside LAA's switch to
; the symbolic maximum backedge-taken count per the PR discussion — confirm.
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %latch ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1

%Aidx_next = getelementptr inbounds i32, ptr %A, i64 %indvars.iv.next
%Bidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
%Cidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
%Aidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv

; Uncountable early exit: continue to the latch only while the loaded
; value equals 10; otherwise leave the loop.
%b = load i32, ptr %Bidx, align 4
%uncountable.c = icmp eq i32 %b, 10
br i1 %uncountable.c, label %latch, label %for.end

latch:
; Store to A[iv + 1] in one iteration; A[iv] is loaded in the next
; (store-to-load distance of one iteration).
%a_p1 = add i32 %b, 2
store i32 %a_p1, ptr %Aidx_next, align 4

%a = load i32, ptr %Aidx, align 1
%c = mul i32 %a, 2
store i32 %c, ptr %Cidx, align 4

; Countable exit: %N iterations via the latch.
%exitcond = icmp eq i64 %indvars.iv.next, %N
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}
Loading
Loading