Skip to content

Commit 0f1db00

Browse files
committed
[SROA] Optimize reloaded values in allocas that escape into readonly nocapture calls.
Given an alloca that potentially has many uses in complex code and escapes into a call that is readonly+nocapture, we cannot easily split up the alloca. There are several optimizations that will attempt to take a value that is stored and a reload, and replace the load with the original stored value. Instcombine has some simple heuristics, GVN can sometimes do it, as can early CSE in limited situations. They all suffer from the same issue with complex code - they start from a load/store and need to prove no-alias for all code between, which in complex cases might be a lot to look through, especially if the ptr is an alloca with many uses that is over the normal escape capture limits. The pass that does do well with allocas is SROA, as it has a complete view of the alloca and all of its uses. This patch adds a case to SROA where it can detect allocas that are passed into calls that are no-capture readonly. It can then optimize the reloaded values inside the alloca slice with the stored value knowing that it is valid no matter the location of the loads/stores from the no-escaping nature of the alloca.
1 parent 8eb51dd commit 0f1db00

File tree

4 files changed

+104
-12
lines changed

4 files changed

+104
-12
lines changed

llvm/include/llvm/Analysis/PtrUseVisitor.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ class PtrUseVisitorBase {
6464
/// Is the pointer escaped at some point?
6565
bool isEscaped() const { return EscapedInfo != nullptr; }
6666

67+
/// Is the pointer escaped into a read-only nocapture call at some point?
/// True once setEscapedReadOnly() has recorded such a call, i.e. when
/// EscapedReadOnly is non-null.
68+
bool isEscapedReadOnly() const { return EscapedReadOnly != nullptr; }
69+
6770
/// Get the instruction causing the visit to abort.
6871
/// \returns a pointer to the instruction causing the abort if one is
6972
/// available; otherwise returns null.
@@ -74,6 +77,10 @@ class PtrUseVisitorBase {
7477
/// is available; otherwise returns null.
7578
Instruction *getEscapingInst() const { return EscapedInfo; }
7679

80+
/// Get the instruction causing the pointer to escape which is a read-only
81+
/// nocapture call.
/// \returns the instruction most recently passed to setEscapedReadOnly(),
/// or null if no such call has been recorded.
82+
Instruction *getEscapedReadOnlyInst() const { return EscapedReadOnly; }
83+
7784
/// Mark the visit as aborted. Intended for use in a void return.
7885
/// \param I The instruction which caused the visit to abort, if available.
7986
void setAborted(Instruction *I) {
@@ -88,6 +95,12 @@ class PtrUseVisitorBase {
8895
EscapedInfo = I;
8996
}
9097

98+
/// Mark the pointer as escaped into a readonly-nocapture call.
/// This only records \p I in EscapedReadOnly; it does not touch the
/// aborted or escaped state tracked by AbortedInfo / EscapedInfo.
/// \param I The call instruction the pointer escapes into; must be non-null.
99+
void setEscapedReadOnly(Instruction *I) {
100+
assert(I && "Expected a valid pointer in setEscapedReadOnly");
101+
// Overwrites any previously recorded instruction, so only the most recent
// call is remembered.
EscapedReadOnly = I;
102+
}
103+
91104
/// Mark the pointer as escaped, and the visit as aborted. Intended
92105
/// for use in a void return.
93106
/// \param I The instruction which both escapes the pointer and aborts the
@@ -100,6 +113,7 @@ class PtrUseVisitorBase {
100113
private:
101114
Instruction *AbortedInfo = nullptr;
102115
Instruction *EscapedInfo = nullptr;
116+
Instruction *EscapedReadOnly = nullptr;
103117
};
104118

105119
protected:

llvm/lib/Transforms/Scalar/SROA.cpp

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
#include "llvm/Analysis/GlobalsModRef.h"
4444
#include "llvm/Analysis/Loads.h"
4545
#include "llvm/Analysis/PtrUseVisitor.h"
46+
#include "llvm/Analysis/ValueTracking.h"
4647
#include "llvm/Config/llvm-config.h"
4748
#include "llvm/IR/BasicBlock.h"
4849
#include "llvm/IR/Constant.h"
@@ -246,6 +247,7 @@ class SROA {
246247
bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
247248
AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
248249
bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
250+
bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
249251
std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
250252
void clobberUse(Use &U);
251253
bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
@@ -598,6 +600,7 @@ class AllocaSlices {
598600
/// If this is true, the slices are never fully built and should be
599601
/// ignored.
600602
bool isEscaped() const { return PointerEscapingInstr; }
603+
/// Test whether an instruction through which the pointer escapes into a
/// read-only call was recorded, i.e. whether PointerEscapingInstrReadOnly
/// is non-null.
bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
601604

602605
/// Support for iterating over the slices.
603606
/// @{
@@ -680,6 +683,7 @@ class AllocaSlices {
680683
/// store a pointer to that here and abort trying to form slices of the
681684
/// alloca. This will be null if the alloca slices are analyzed successfully.
682685
Instruction *PointerEscapingInstr;
686+
Instruction *PointerEscapingInstrReadOnly;
683687

684688
/// The slices of the alloca.
685689
///
@@ -1390,14 +1394,31 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
13901394

13911395
/// Disable SROA entirely if there are unhandled users of the alloca.
13921396
void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1397+
1398+
/// Visit a call that uses the pointer currently being tracked.
///
/// The call is recorded through PI.setEscapedReadOnly() when it has a known
/// callee with the same number of parameters as the call has arguments, it
/// carries no operand bundles, and every argument equal to the current use
/// maps to a callee parameter that is both nocapture and read-only. Any other
/// call is forwarded to Base::visitCallBase().
void visitCallBase(CallBase &CB) {
  Function *Callee = CB.getCalledFunction();

  // Parameter attributes can only be matched up with call arguments when the
  // callee is known, the argument counts agree and there are no bundles.
  const bool CanInspectParams = Callee &&
                                CB.arg_size() == Callee->arg_size() &&
                                !CB.hasOperandBundles();

  // An argument slot is acceptable if it does not carry the tracked use, or
  // if the matching callee parameter is nocapture and only reads memory.
  auto IsSafeSlot = [&](auto Arg) {
    if (Arg.value() != *U)
      return true;
    auto *Param = Callee->getArg(Arg.index());
    return Param->hasNoCaptureAttr() && Param->onlyReadsMemory();
  };

  if (!CanInspectParams || !all_of(enumerate(CB.args()), IsSafeSlot)) {
    Base::visitCallBase(CB);
    return;
  }

  PI.setEscapedReadOnly(&CB);
}
13931414
};
13941415

13951416
AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
13961417
:
13971418
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
13981419
AI(AI),
13991420
#endif
1400-
PointerEscapingInstr(nullptr) {
1421+
PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
14011422
SliceBuilder PB(DL, AI, *this);
14021423
SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
14031424
if (PtrI.isEscaped() || PtrI.isAborted()) {
@@ -1408,6 +1429,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
14081429
assert(PointerEscapingInstr && "Did not track a bad instruction");
14091430
return;
14101431
}
1432+
PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
14111433

14121434
llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
14131435

@@ -1445,6 +1467,9 @@ void AllocaSlices::print(raw_ostream &OS) const {
14451467
return;
14461468
}
14471469

1470+
if (PointerEscapingInstrReadOnly)
1471+
OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1472+
14481473
OS << "Slices of alloca: " << AI << "\n";
14491474
for (const_iterator I = begin(), E = end(); I != E; ++I)
14501475
print(OS, I);
@@ -5454,6 +5479,54 @@ void SROA::clobberUse(Use &U) {
54545479
}
54555480
}
54565481

5482+
/// Forward a stored value to the loads that re-read it from an alloca.
///
/// For each partition of \p AS in which every slice spans exactly the
/// partition's [beginOffset, endOffset) range and whose live users are a
/// single store plus loads and assume-like intrinsics, every load that has the
/// same type as the stored value and is dominated by the store has its uses
/// replaced by the stored value. The store and the loads themselves are left
/// in place.
///
/// \param AI The alloca the slices belong to (not used by this function).
/// \param AS The slices of \p AI to inspect.
/// \returns true if at least one load was forwarded, false otherwise.
bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
  bool Changed = false;

  for (auto &P : AS.partitions()) {
    // Make sure all the slices inside the partition are the full width: both
    // the begin and the end offset have to match the partition's.
    // NOTE(review): slices that begin in an earlier partition but overlap this
    // one are not visited by iterating P - confirm they cannot occur here.
    if (any_of(P, [&P](Slice &S) {
          return S.beginOffset() != P.beginOffset() ||
                 S.endOffset() != P.endOffset();
        }))
      continue;

    // Check there is a single store and nothing else other than loads (and
    // assume-like intrinsics). Store is reset to null as soon as the
    // partition is found to be unsuitable.
    StoreInst *Store = nullptr;
    for (Slice &S : P) {
      if (S.isDead())
        continue;
      auto *Usr = S.getUse()->getUser();
      if (auto *St = dyn_cast<StoreInst>(Usr)) {
        if (Store) {
          // A second store: it is no longer known which value a load sees.
          Store = nullptr;
          break;
        }
        Store = St;
      } else if (!isa<LoadInst>(Usr) &&
                 !isAssumeLikeIntrinsic(cast<Instruction>(Usr))) {
        Store = nullptr;
        break;
      }
    }

    if (!Store)
      continue;

    // Replace loads by the value that was stored.
    for (Slice &S : P) {
      // Skip dead slices exactly like the scan above does.
      if (S.isDead())
        continue;
      auto *Ld = dyn_cast<LoadInst>(S.getUse()->getUser());
      if (!Ld || Store->getValueOperand()->getType() != Ld->getType())
        continue;
      // Only loads that execute after the store can observe its value.
      if (!DTU->getDomTree().dominates(Store, Ld))
        continue;

      LLVM_DEBUG(dbgs() << "    Replacing " << *Ld << " with "
                        << *Store->getValueOperand() << "\n");
      Ld->replaceAllUsesWith(Store->getValueOperand());
      Changed = true;
    }
  }

  return Changed;
}
5529+
54575530
/// Analyze an alloca for SROA.
54585531
///
54595532
/// This analyzes the alloca to ensure we can reason about it, builds
@@ -5494,6 +5567,11 @@ SROA::runOnAlloca(AllocaInst &AI) {
54945567
if (AS.isEscaped())
54955568
return {Changed, CFGChanged};
54965569

5570+
if (AS.isEscapedReadOnly()) {
5571+
Changed |= propagateStoredValuesToLoads(AI, AS);
5572+
return {Changed, CFGChanged};
5573+
}
5574+
54975575
// Delete all the dead users of this alloca before splitting and rewriting it.
54985576
for (Instruction *DeadUser : AS.getDeadUsers()) {
54995577
// Free up everything used by this instruction.

llvm/test/Transforms/SROA/non-capturing-call-readonly.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,7 @@ define [2 x i32] @part_of_alloca_used_in_call(ptr %data, i64 %n) {
485485
; CHECK-NEXT: [[I0:%.*]] = call i32 @user_of_alloca(ptr [[RETVAL]])
486486
; CHECK-NEXT: [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
487487
; CHECK-NEXT: [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4
488-
; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I1_FCA_0_LOAD]], 0
488+
; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
489489
; CHECK-NEXT: [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
490490
; CHECK-NEXT: [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4
491491
; CHECK-NEXT: [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1
@@ -538,7 +538,7 @@ define [2 x i32] @all_parts_of_alloca_used_in_call_with_multiple_args(ptr %data,
538538
; CHECK-NEXT: [[I0:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL]], ptr [[RETVAL_FULL]])
539539
; CHECK-NEXT: [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
540540
; CHECK-NEXT: [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4
541-
; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I1_FCA_0_LOAD]], 0
541+
; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
542542
; CHECK-NEXT: [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
543543
; CHECK-NEXT: [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4
544544
; CHECK-NEXT: [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1
@@ -701,7 +701,7 @@ define [2 x i32] @part_of_alloca_used_in_call_with_multiple_args(ptr %data, i64
701701
; CHECK-NEXT: [[I0:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL]], ptr [[RETVAL]])
702702
; CHECK-NEXT: [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
703703
; CHECK-NEXT: [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4
704-
; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I1_FCA_0_LOAD]], 0
704+
; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
705705
; CHECK-NEXT: [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
706706
; CHECK-NEXT: [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4
707707
; CHECK-NEXT: [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1
@@ -757,7 +757,7 @@ define [2 x i32] @all_parts_of_alloca_used_in_calls_with_multiple_args(ptr %data
757757
; CHECK-NEXT: [[I2:%.*]] = call i32 @capture_of_alloca(ptr [[SOME_ANOTHER_ALLOCA_FULL]])
758758
; CHECK-NEXT: [[I3_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
759759
; CHECK-NEXT: [[I3_FCA_0_LOAD:%.*]] = load i32, ptr [[I3_FCA_0_GEP]], align 4
760-
; CHECK-NEXT: [[I3_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I3_FCA_0_LOAD]], 0
760+
; CHECK-NEXT: [[I3_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
761761
; CHECK-NEXT: [[I3_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
762762
; CHECK-NEXT: [[I3_FCA_1_LOAD:%.*]] = load i32, ptr [[I3_FCA_1_GEP]], align 4
763763
; CHECK-NEXT: [[I3_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I3_FCA_0_INSERT]], i32 [[I3_FCA_1_LOAD]], 1
@@ -817,7 +817,7 @@ define i64 @do_schedule_instrs_for_dce_after_fixups() {
817817
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 1
818818
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @user_of_alloca(ptr [[ADD_PTR]])
819819
; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[C]], align 4
820-
; CHECK-NEXT: ret i64 [[LD]]
820+
; CHECK-NEXT: ret i64 0
821821
;
822822
entry:
823823
%c = alloca i64, align 2
@@ -867,7 +867,7 @@ define i8 @transform_load_and_store() {
867867
; CHECK-NEXT: store i8 0, ptr [[A]], align 1
868868
; CHECK-NEXT: call void @byte_user_of_alloca(ptr [[A]])
869869
; CHECK-NEXT: [[R:%.*]] = load i8, ptr [[A]], align 1
870-
; CHECK-NEXT: ret i8 [[R]]
870+
; CHECK-NEXT: ret i8 0
871871
;
872872
entry:
873873
%a = alloca i8

llvm/test/Transforms/SROA/readonlynocapture.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ define i32 @simple() {
99
; CHECK-NEXT: store i32 0, ptr [[A]], align 4
1010
; CHECK-NEXT: call void @callee(ptr [[A]])
1111
; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[A]], align 4
12-
; CHECK-NEXT: ret i32 [[L1]]
12+
; CHECK-NEXT: ret i32 0
1313
;
1414
%a = alloca i32
1515
store i32 0, ptr %a
@@ -27,7 +27,7 @@ define i32 @twoalloc() {
2727
; CHECK-NEXT: call void @callee(ptr [[A]])
2828
; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[A]], align 4
2929
; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[B]], align 4
30-
; CHECK-NEXT: ret i32 [[L2]]
30+
; CHECK-NEXT: ret i32 1
3131
;
3232
%a = alloca {i32, i32}
3333
store i32 0, ptr %a
@@ -85,7 +85,7 @@ define i32 @twocalls() {
8585
; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[A]], align 4
8686
; CHECK-NEXT: call void @callee(ptr [[A]])
8787
; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[B]], align 4
88-
; CHECK-NEXT: ret i32 [[L2]]
88+
; CHECK-NEXT: ret i32 1
8989
;
9090
%a = alloca {i32, i32}
9191
store i32 0, ptr %a
@@ -107,7 +107,7 @@ define i32 @volatile() {
107107
; CHECK-NEXT: call void @callee(ptr [[A]])
108108
; CHECK-NEXT: [[L1:%.*]] = load volatile i32, ptr [[A]], align 4
109109
; CHECK-NEXT: [[L2:%.*]] = load volatile i32, ptr [[B]], align 4
110-
; CHECK-NEXT: ret i32 [[L2]]
110+
; CHECK-NEXT: ret i32 1
111111
;
112112
%a = alloca {i32, i32}
113113
store i32 0, ptr %a
@@ -172,7 +172,7 @@ define i32 @multiuse() {
172172
; CHECK-NEXT: call void @callee_multiuse(ptr [[A]], ptr [[A]])
173173
; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[A]], align 4
174174
; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[B]], align 4
175-
; CHECK-NEXT: ret i32 [[L2]]
175+
; CHECK-NEXT: ret i32 1
176176
;
177177
%a = alloca {i32, i32}
178178
store i32 0, ptr %a

0 commit comments

Comments
 (0)