Skip to content

Commit 962fbaf

Browse files
authored
[LoopVectorize] Refine runtime memory check costs when there is an outer loop (#76034)
When we generate runtime memory checks for an inner loop, it's possible that these checks are invariant in the outer loop and so will get hoisted out. In such cases, the effective cost of the checks should be reduced to reflect the outer loop trip count. This fixes a 25% performance regression introduced by commit 49b0e6d when building the SPEC2017 x264 benchmark with PGO, where we decided the inner loop trip count wasn't high enough to warrant the (incorrect) high cost of the runtime checks. Also, when the runtime memory checks consist entirely of diff checks, they are likely to be outer loop invariant.
1 parent 70fbcdb commit 962fbaf

File tree

2 files changed

+273
-6
lines changed

2 files changed

+273
-6
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 56 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1950,6 +1950,8 @@ class GeneratedRTChecks {
19501950
bool CostTooHigh = false;
19511951
const bool AddBranchWeights;
19521952

1953+
Loop *OuterLoop = nullptr;
1954+
19531955
public:
19541956
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
19551957
TargetTransformInfo *TTI, const DataLayout &DL,
@@ -2046,6 +2048,9 @@ class GeneratedRTChecks {
20462048
DT->eraseNode(SCEVCheckBlock);
20472049
LI->removeBlock(SCEVCheckBlock);
20482050
}
2051+
2052+
// Outer loop is used as part of the later cost calculations.
2053+
OuterLoop = L->getParentLoop();
20492054
}
20502055

20512056
InstructionCost getCost() {
@@ -2069,16 +2074,61 @@ class GeneratedRTChecks {
20692074
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
20702075
RTCheckCost += C;
20712076
}
2072-
if (MemCheckBlock)
2077+
if (MemCheckBlock) {
2078+
InstructionCost MemCheckCost = 0;
20732079
for (Instruction &I : *MemCheckBlock) {
20742080
if (MemCheckBlock->getTerminator() == &I)
20752081
continue;
20762082
InstructionCost C =
20772083
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
20782084
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2079-
RTCheckCost += C;
2085+
MemCheckCost += C;
20802086
}
20812087

2088+
// If the runtime memory checks are being created inside an outer loop
2089+
// we should find out if these checks are outer loop invariant. If so,
2090+
// the checks will likely be hoisted out and so the effective cost will
2091+
// reduce according to the outer loop trip count.
2092+
if (OuterLoop) {
2093+
ScalarEvolution *SE = MemCheckExp.getSE();
2094+
// TODO: If profitable, we could refine this further by analysing every
2095+
// individual memory check, since there could be a mixture of loop
2096+
// variant and invariant checks that mean the final condition is
2097+
// variant.
2098+
const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2099+
if (SE->isLoopInvariant(Cond, OuterLoop)) {
2100+
// It seems reasonable to assume that we can reduce the effective
2101+
// cost of the checks even when we know nothing about the trip
2102+
// count. Assume that the outer loop executes at least twice.
2103+
unsigned BestTripCount = 2;
2104+
2105+
// If exact trip count is known use that.
2106+
if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2107+
BestTripCount = SmallTC;
2108+
else if (LoopVectorizeWithBlockFrequency) {
2109+
// Else use profile data if available.
2110+
if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2111+
BestTripCount = *EstimatedTC;
2112+
}
2113+
2114+
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2115+
2116+
// Let's ensure the cost is always at least 1.
2117+
NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2118+
(InstructionCost::CostType)1);
2119+
2120+
LLVM_DEBUG(dbgs()
2121+
<< "We expect runtime memory checks to be hoisted "
2122+
<< "out of the outer loop. Cost reduced from "
2123+
<< MemCheckCost << " to " << NewMemCheckCost << '\n');
2124+
2125+
MemCheckCost = NewMemCheckCost;
2126+
}
2127+
}
2128+
2129+
RTCheckCost += MemCheckCost;
2130+
}
2131+
20822132
if (SCEVCheckBlock || MemCheckBlock)
20832133
LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
20842134
<< "\n");
@@ -2137,8 +2187,8 @@ class GeneratedRTChecks {
21372187

21382188
BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
21392189
// Create new preheader for vector loop.
2140-
if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2141-
PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2190+
if (OuterLoop)
2191+
OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
21422192

21432193
SCEVCheckBlock->getTerminator()->eraseFromParent();
21442194
SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
@@ -2172,8 +2222,8 @@ class GeneratedRTChecks {
21722222
DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
21732223
MemCheckBlock->moveBefore(LoopVectorPreHeader);
21742224

2175-
if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2176-
PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2225+
if (OuterLoop)
2226+
OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
21772227

21782228
BranchInst &BI =
21792229
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
; REQUIRES: asserts
2+
; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
3+
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %off, i64 noundef %n) {
7+
; CHECK-LABEL: LV: Checking a loop in 'no_outer_loop'
8+
; CHECK: Calculating cost of runtime checks:
9+
; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop.
10+
; CHECK: Total cost of runtime checks: 4
11+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
12+
entry:
13+
br label %inner.loop
14+
15+
inner.loop:
16+
%inner.iv = phi i64 [ 0, %entry ], [ %inner.iv.next, %inner.loop ]
17+
%add.us = add nuw nsw i64 %inner.iv, %off
18+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
19+
%0 = load i8, ptr %arrayidx.us, align 1
20+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
21+
%1 = load i8, ptr %arrayidx7.us, align 1
22+
%add9.us = add i8 %1, %0
23+
store i8 %add9.us, ptr %arrayidx7.us, align 1
24+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
25+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
26+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
27+
28+
inner.exit:
29+
ret void
30+
}
31+
32+
define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
33+
; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
34+
; CHECK: Calculating cost of runtime checks:
35+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
36+
; CHECK: Total cost of runtime checks: 3
37+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
38+
entry:
39+
br label %outer.loop
40+
41+
outer.loop:
42+
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
43+
%mul.us = mul nsw i64 %outer.iv, %n
44+
br label %inner.loop
45+
46+
inner.loop:
47+
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
48+
%add.us = add nuw nsw i64 %inner.iv, %mul.us
49+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
50+
%0 = load i8, ptr %arrayidx.us, align 1
51+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
52+
%1 = load i8, ptr %arrayidx7.us, align 1
53+
%add9.us = add i8 %1, %0
54+
store i8 %add9.us, ptr %arrayidx7.us, align 1
55+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
56+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
57+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
58+
59+
inner.exit:
60+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
61+
%exitcond27.not = icmp eq i64 %outer.iv.next, %m
62+
br i1 %exitcond27.not, label %outer.exit, label %outer.loop
63+
64+
outer.exit:
65+
ret void
66+
}
67+
68+
69+
define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
70+
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
71+
; CHECK: Calculating cost of runtime checks:
72+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
73+
; CHECK: Total cost of runtime checks: 2
74+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
75+
entry:
76+
br label %outer.loop
77+
78+
outer.loop:
79+
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
80+
%mul.us = mul nsw i64 %outer.iv, %n
81+
br label %inner.loop
82+
83+
inner.loop:
84+
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
85+
%add.us = add nuw nsw i64 %inner.iv, %mul.us
86+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
87+
%0 = load i8, ptr %arrayidx.us, align 1
88+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
89+
%1 = load i8, ptr %arrayidx7.us, align 1
90+
%add9.us = add i8 %1, %0
91+
store i8 %add9.us, ptr %arrayidx7.us, align 1
92+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
93+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
94+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
95+
96+
inner.exit:
97+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
98+
%exitcond26.not = icmp eq i64 %outer.iv.next, 3
99+
br i1 %exitcond26.not, label %outer.exit, label %outer.loop
100+
101+
outer.exit:
102+
ret void
103+
}
104+
105+
106+
define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
107+
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
108+
; CHECK: Calculating cost of runtime checks:
109+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
110+
; CHECK: Total cost of runtime checks: 1
111+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
112+
entry:
113+
br label %outer.loop
114+
115+
outer.loop:
116+
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
117+
%mul.us = mul nsw i64 %outer.iv, %n
118+
br label %inner.loop
119+
120+
inner.loop:
121+
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
122+
%add.us = add nuw nsw i64 %inner.iv, %mul.us
123+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
124+
%0 = load i8, ptr %arrayidx.us, align 1
125+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
126+
%1 = load i8, ptr %arrayidx7.us, align 1
127+
%add9.us = add i8 %1, %0
128+
store i8 %add9.us, ptr %arrayidx7.us, align 1
129+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
130+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
131+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
132+
133+
inner.exit:
134+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
135+
%exitcond26.not = icmp eq i64 %outer.iv.next, 64
136+
br i1 %exitcond26.not, label %outer.exit, label %outer.loop
137+
138+
outer.exit:
139+
ret void
140+
}
141+
142+
143+
define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
144+
; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
145+
; CHECK: Calculating cost of runtime checks:
146+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
147+
; CHECK: Total cost of runtime checks: 2
148+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
149+
entry:
150+
br label %outer.loop
151+
152+
outer.loop:
153+
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
154+
%mul.us = mul nsw i64 %outer.iv, %n
155+
br label %inner.loop
156+
157+
inner.loop:
158+
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
159+
%add.us = add nuw nsw i64 %inner.iv, %mul.us
160+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
161+
%0 = load i8, ptr %arrayidx.us, align 1
162+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
163+
%1 = load i8, ptr %arrayidx7.us, align 1
164+
%add9.us = add i8 %1, %0
165+
store i8 %add9.us, ptr %arrayidx7.us, align 1
166+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
167+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
168+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
169+
170+
inner.exit:
171+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
172+
%exitcond26.not = icmp eq i64 %outer.iv.next, %m
173+
br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
174+
175+
outer.exit:
176+
ret void
177+
}
178+
179+
180+
define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
181+
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
182+
; CHECK: Calculating cost of runtime checks:
183+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
184+
; CHECK: Total cost of runtime checks: 2
185+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
186+
entry:
187+
br label %outer.loop
188+
189+
outer.loop:
190+
%outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
191+
%0 = mul nsw i64 %outer.iv, %n
192+
br label %inner.loop
193+
194+
inner.loop:
195+
%iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
196+
%1 = add nuw nsw i64 %iv.inner, %0
197+
%arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
198+
%2 = load i32, ptr %arrayidx.us, align 4
199+
%arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
200+
%3 = load i32, ptr %arrayidx8.us, align 4
201+
%add9.us = add nsw i32 %3, %2
202+
store i32 %add9.us, ptr %arrayidx8.us, align 4
203+
%iv.inner.next = add nuw nsw i64 %iv.inner, 1
204+
%inner.exit.cond = icmp eq i64 %iv.inner.next, %n
205+
br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
206+
207+
inner.exit:
208+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
209+
%outer.exit.cond = icmp eq i64 %outer.iv.next, 3
210+
br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
211+
212+
outer.exit:
213+
ret void
214+
}
215+
216+
217+
!0 = !{!"branch_weights", i32 10, i32 20}

0 commit comments

Comments
 (0)