-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[VectorCombine] Shrink loads used in shufflevector rebroadcasts #128938
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 12 commits
3a75016
f566b7a
f5c8bf5
157074c
d6c00c0
4d6873f
1861c7d
6534370
5b63d6f
f2c6b30
4dc36ee
1092f5b
265f960
070b6f8
6ca4bfa
0310170
6a9dd0b
019e39f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
#include "llvm/ADT/DenseMap.h" | ||
#include "llvm/ADT/STLExtras.h" | ||
#include "llvm/ADT/ScopeExit.h" | ||
#include "llvm/ADT/SmallVector.h" | ||
#include "llvm/ADT/Statistic.h" | ||
#include "llvm/Analysis/AssumptionCache.h" | ||
#include "llvm/Analysis/BasicAliasAnalysis.h" | ||
|
@@ -32,8 +33,10 @@ | |
#include "llvm/Transforms/Utils/Local.h" | ||
#include "llvm/Transforms/Utils/LoopUtils.h" | ||
#include <numeric> | ||
#include <optional> | ||
#include <queue> | ||
#include <set> | ||
#include <tuple> | ||
|
||
#define DEBUG_TYPE "vector-combine" | ||
#include "llvm/Transforms/Utils/InstructionWorklist.h" | ||
|
@@ -131,6 +134,7 @@ class VectorCombine { | |
bool foldSelectShuffle(Instruction &I, bool FromReduction = false); | ||
bool foldInterleaveIntrinsics(Instruction &I); | ||
bool shrinkType(Instruction &I); | ||
bool shrinkLoadForShuffles(Instruction &I); | ||
|
||
void replaceValue(Value &Old, Value &New) { | ||
LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); | ||
|
@@ -3483,6 +3487,131 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) { | |
return true; | ||
} | ||
|
||
/// Attempt to shrink a load that is only used by shufflevector instructions.
///
/// If \p I is a simple load of a fixed-width vector, every user is a
/// shufflevector whose second operand is poison/undef, and the live shuffles
/// only read a leading prefix of the loaded elements, the load is replaced by
/// a narrower load of that prefix and each shuffle is rebuilt on top of it.
/// The rewrite is only performed when the target cost model does not rate the
/// narrowed load plus shuffles as more expensive than the originals.
///
/// \param I candidate instruction; anything but a simple fixed-vector load is
///          rejected.
/// \returns true if the IR was changed, false otherwise (in which case the IR
///          is left untouched).
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
  auto *OldLoad = dyn_cast<LoadInst>(&I);
  if (!OldLoad || !OldLoad->isSimple())
    return false;

  auto *VecTy = dyn_cast<FixedVectorType>(OldLoad->getType());
  if (!VecTy)
    return false;

  const int OldSize = static_cast<int>(VecTy->getNumElements());

  // Pass 1: validate every user and find the highest loaded element that a
  // live shuffle actually reads.
  SmallVector<ShuffleVectorInst *, 4> Shuffles;
  int MaxUsedIndex = -1;
  for (Use &U : I.uses()) {
    // All uses must be shufflevector instructions whose second operand is
    // poison or undef. PoisonValue is a subclass of UndefValue, so a single
    // isa<UndefValue> covers both. Because operand 1 is therefore never the
    // load, the load is necessarily operand 0.
    auto *Shuffle = dyn_cast<ShuffleVectorInst>(U.getUser());
    if (!Shuffle || !isa<UndefValue>(Shuffle->getOperand(1)))
      return false;
    Shuffles.push_back(Shuffle);

    // Shuffles without uses place no requirement on the load width.
    if (Shuffle->use_empty())
      continue;

    // Only lanes in [0, OldSize) read from the load; negative lanes are
    // poison mask elements and lanes >= OldSize select the poison/undef
    // operand, so neither constrains how many elements must be loaded.
    for (int Index : Shuffle->getShuffleMask())
      if (Index >= 0 && Index < OldSize)
        MaxUsedIndex = std::max(MaxUsedIndex, Index);
  }

  // Nothing live reads the load, or the whole vector is needed.
  // TODO: The lowest used index could be used as well to trim both ends of
  // the load. That is left for a follow-up change so the performance impact
  // of the resulting alignment changes can be isolated.
  const int NewSize = MaxUsedIndex + 1;
  if (NewSize <= 0 || NewSize >= OldSize)
    return false;

  Type *ElemTy = VecTy->getElementType();
  auto *NewVecTy = FixedVectorType::get(ElemTy, static_cast<unsigned>(NewSize));

  // Pass 2: cost both variants. Nothing is created yet, so a rejection leaves
  // the IR exactly as it was. The narrowed load keeps the alignment and
  // address space of the original.
  InstructionCost OldCost = TTI.getMemoryOpCost(
      Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
      OldLoad->getPointerAddressSpace(), CostKind);
  InstructionCost NewCost = TTI.getMemoryOpCost(
      Instruction::Load, NewVecTy, OldLoad->getAlign(),
      OldLoad->getPointerAddressSpace(), CostKind);

  // NewMasks[K] is the rewritten mask for Shuffles[K].
  SmallVector<SmallVector<int, 16>, 4> NewMasks;
  NewMasks.reserve(Shuffles.size());
  for (ShuffleVectorInst *Shuffle : Shuffles) {
    ArrayRef<int> OldMask = Shuffle->getShuffleMask();
    SmallVector<int, 16> &NewMask = NewMasks.emplace_back();
    NewMask.reserve(OldMask.size());

    // Lanes inside the narrowed load keep their index. Every other lane
    // (already a poison mask element, a lane of the poison/undef operand, or
    // a lane of an unused shuffle that lies beyond the narrowed load) becomes
    // a poison mask element, so the new mask is always in range for NewVecTy.
    // NOTE(review): as before this change, lanes that selected an *undef*
    // second operand end up poison in the rebuilt shuffle - confirm this is
    // acceptable.
    for (int Index : OldMask)
      NewMask.push_back(Index >= 0 && Index < NewSize ? Index
                                                      : PoisonMaskElem);

    OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, OldMask,
                                  CostKind);
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, NewMask,
                                  CostKind);
  }

  if (!NewCost.isValid() || OldCost < NewCost)
    return false;

  // Profitable: emit the narrowed load right before the original one.
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());
  LoadInst *NewLoad = Builder.CreateAlignedLoad(
      NewVecTy, OldLoad->getPointerOperand(), OldLoad->getAlign());
  NewLoad->copyMetadata(I);

  // Rebuild each shuffle on top of the narrowed load, at the position and
  // with the debug location of the shuffle it replaces.
  for (size_t Idx = 0, End = Shuffles.size(); Idx != End; ++Idx) {
    ShuffleVectorInst *Shuffle = Shuffles[Idx];
    Builder.SetInsertPoint(Shuffle);
    Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
    Value *NewShuffle = Builder.CreateShuffleVector(
        NewLoad, PoisonValue::get(NewVecTy), NewMasks[Idx]);

    replaceValue(*Shuffle, *NewShuffle);
  }

  return true;
}
|
||
/// This is the entry point for all transforms. Pass manager differences are | ||
/// handled in the callers of this function. | ||
bool VectorCombine::run() { | ||
|
@@ -3558,6 +3687,9 @@ bool VectorCombine::run() { | |
MadeChange |= foldSelectShuffle(I); | ||
MadeChange |= foldShuffleToIdentity(I); | ||
break; | ||
case Instruction::Load: | ||
MadeChange |= shrinkLoadForShuffles(I); | ||
break; | ||
case Instruction::BitCast: | ||
MadeChange |= foldBitcastShuffle(I); | ||
break; | ||
|
Uh oh!
There was an error while loading. Please reload this page.