@@ -2911,7 +2911,8 @@ class BoUpSLP {
     }
     if (Last->State != TreeEntry::NeedToGather) {
       for (Value *V : VL) {
-        assert(!getTreeEntry(V) && "Scalar already in tree!");
+        [[maybe_unused]] const TreeEntry *TE = getTreeEntry(V);
+        assert((!TE || TE == Last) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
@@ -2924,7 +2925,8 @@ class BoUpSLP {
       for (Value *V : VL) {
         if (doesNotNeedToBeScheduled(V))
           continue;
-        assert(BundleMember && "Unexpected end of bundle.");
+        if (!BundleMember)
+          continue;
         BundleMember->TE = Last;
         BundleMember = BundleMember->NextInBundle;
       }
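The two hunks above make newTreeEntry tolerate bundles that repeat a scalar: remapping a value to the entry it already belongs to is now legal, and the walk over the schedule bundle simply stops consuming members once the deduplicated, hence shorter, bundle runs out. A minimal standalone sketch of the idempotent-mapping idea, using plain STL containers in place of LLVM's ScalarToTreeEntry map:

#include <cassert>
#include <unordered_map>
#include <vector>

struct Entry {};

// Map every scalar of a bundle to its tree entry. A repeated scalar hits
// the same slot twice; that is fine as long as it resolves to the same
// entry, while a clash with a different entry is still a bug.
void mapBundle(std::unordered_map<int *, Entry *> &ScalarToEntry,
               const std::vector<int *> &Bundle, Entry *E) {
  for (int *Scalar : Bundle) {
    Entry *&Slot = ScalarToEntry[Scalar]; // default-initialized to nullptr
    assert((!Slot || Slot == E) && "Scalar already in a different entry!");
    Slot = E; // idempotent for repeated scalars
  }
}

The same reasoning explains the second hunk: the scheduler bundle is now built from unique scalars only, so it can be shorter than VL, and reaching its end mid-walk is expected rather than an invariant violation.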
@@ -5583,9 +5585,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,

   SmallVector<int> ReuseShuffleIndicies;
   SmallVector<Value *> UniqueValues;
-  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
-                                &UserTreeIdx,
-                                this](const InstructionsState &S) {
+  SmallVector<Value *> NonUniqueValueVL;
+  auto TryToFindDuplicates = [&](const InstructionsState &S,
+                                 bool DoNotFail = false) {
     // Check that every instruction appears once in this bundle.
     DenseMap<Value *, unsigned> UniquePositions(VL.size());
     for (Value *V : VL) {
@@ -5612,6 +5614,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                                           !isConstant(V);
                                                  })) ||
           !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+        if (DoNotFail && UniquePositions.size() > 1 &&
+            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
+            all_of(UniqueValues, [=](Value *V) {
+              return isa<ExtractElementInst>(V) ||
+                     areAllUsersVectorized(cast<Instruction>(V),
+                                           UserIgnoreList);
+            })) {
+          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+          if (PWSz == VL.size()) {
+            ReuseShuffleIndicies.clear();
+          } else {
+            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
+            NonUniqueValueVL.append(PWSz - UniqueValues.size(),
+                                    UniqueValues.back());
+            VL = NonUniqueValueVL;
+          }
+          return true;
+        }
         LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
         return false;
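The DoNotFail path added above rescues bundles with repeated scalars: instead of falling back to a gather, the deduplicated values are padded out to the next power of two with copies of the last unique value, keeping the vector factor a power of two. A sketch of just the padding step in standard C++, with std::bit_ceil standing in for LLVM's PowerOf2Ceil:

#include <bit>
#include <cstddef>
#include <vector>

// Pad a non-empty, deduplicated bundle up to the next power-of-two size
// by repeating its last element.
template <typename T>
std::vector<T> padToPowerOf2(std::vector<T> Unique) {
  const std::size_t PWSz = std::bit_ceil(Unique.size());
  Unique.insert(Unique.end(), PWSz - Unique.size(), Unique.back());
  return Unique;
}

For example, {a, b, c} becomes {a, b, c, c}; when the padded size already equals VL.size(), the patch instead clears ReuseShuffleIndicies and vectorizes VL as-is.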
@@ -5857,7 +5877,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }

   // Check that every instruction appears once in this bundle.
-  if (!TryToFindDuplicates(S))
+  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
     return;

   // Perform specific checks for each particular instruction kind.
@@ -5877,7 +5897,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,

   BlockScheduling &BS = *BSRef;

-  std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+  std::optional<ScheduleData *> Bundle =
+      BS.tryScheduleBundle(UniqueValues, this, S);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
@@ -7537,7 +7558,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
-  const unsigned Sz = VL.size();
+  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
+  const unsigned Sz = UniqueValues.size();
   auto GetCostDiff =
       [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
           function_ref<InstructionCost(InstructionCost)> VectorCost) {
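From here on, getEntryCost charges per-scalar costs over UniqueValues, an order-preserving deduplication of VL (which is what llvm::SetVector provides), so a scalar repeated in the bundle is costed once rather than per occurrence. A plain-STL sketch of that first-occurrence filter:

#include <unordered_set>
#include <vector>

// Keep only the first occurrence of each element, preserving order; this
// mirrors the role SetVector plays in the cost model above.
template <typename T>
std::vector<T> uniqueInOrder(const std::vector<T> &VL) {
  std::vector<T> Unique;
  std::unordered_set<T> Seen;
  for (const T &V : VL)
    if (Seen.insert(V).second) // true only for the first sighting
      Unique.push_back(V);
  return Unique;
}

The remaining hunks in this function are mechanical: each GetScalarCost lambda switches from indexing VL to indexing UniqueValues, and the capture changes from [=] to [&] so the SetVector is not copied into every lambda.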
@@ -7644,7 +7666,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     // Count reused scalars.
     InstructionCost ScalarCost = 0;
     SmallPtrSet<const TreeEntry *, 4> CountedOps;
-    for (Value *V : VL) {
+    for (Value *V : UniqueValues) {
       auto *PHI = dyn_cast<PHINode>(V);
       if (!PHI)
         continue;
@@ -7665,8 +7687,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *I = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *I = cast<Instruction>(UniqueValues[Idx]);
       VectorType *SrcVecTy;
       if (ShuffleOrOp == Instruction::ExtractElement) {
         auto *EE = cast<ExtractElementInst>(I);
@@ -7844,9 +7866,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
       }
     }
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI =
-          VL0->getOpcode() == Opcode ? cast<Instruction>(VL[Idx]) : nullptr;
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = VL0->getOpcode() == Opcode
+                     ? cast<Instruction>(UniqueValues[Idx])
+                     : nullptr;
       return TTI->getCastInstrCost(Opcode, ScalarTy, SrcScalarTy,
                                    TTI::getCastContextHint(VI), CostKind, VI);
     };
@@ -7891,7 +7914,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
             ? CmpInst::BAD_FCMP_PREDICATE
             : CmpInst::BAD_ICMP_PREDICATE;
     auto GetScalarCost = [&](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
                                            ? CmpInst::BAD_FCMP_PREDICATE
                                            : CmpInst::BAD_ICMP_PREDICATE;
@@ -7951,8 +7974,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
       TTI::OperandValueInfo Op2Info =
@@ -7975,14 +7998,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     return CommonCost + GetGEPCostDiff(VL, VL0);
   }
   case Instruction::Load: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<LoadInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
       return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
                                   VI->getPointerAddressSpace(), CostKind,
                                   TTI::OperandValueInfo(), VI);
     };
     auto *LI0 = cast<LoadInst>(VL0);
-    auto GetVectorCost = [=](InstructionCost CommonCost) {
+    auto GetVectorCost = [&](InstructionCost CommonCost) {
       InstructionCost VecLdCost;
       if (E->State == TreeEntry::Vectorize) {
         VecLdCost = TTI->getMemoryOpCost(
@@ -7993,7 +8016,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                 E->State == TreeEntry::PossibleStridedVectorize) &&
                "Unknown EntryState");
         Align CommonAlignment = LI0->getAlign();
-        for (Value *V : VL)
+        for (Value *V : UniqueValues)
           CommonAlignment =
               std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
         VecLdCost = TTI->getGatherScatterOpCost(
@@ -8045,8 +8068,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
            GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
   }
   case Instruction::Call: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *CI = cast<CallInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *CI = cast<CallInst>(UniqueValues[Idx]);
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (ID != Intrinsic::not_intrinsic) {
         IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
@@ -8087,8 +8110,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
       return false;
     };
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
       (void)E;
       return TTI->getInstructionCost(VI, CostKind);
@@ -8607,6 +8630,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
   SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
   SmallVector<APInt> DemandedElts;
+  SmallDenseSet<Value *, 4> UsedInserts;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -8627,6 +8651,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     // to detect it as a final shuffled/identity match.
     if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
       if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+        if (!UsedInserts.insert(VU).second)
+          continue;
         std::optional<unsigned> InsertIdx = getInsertIndex(VU);
         if (InsertIdx) {
           const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
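UsedInserts, declared a few hunks up and consulted here, ensures a given insertelement user is accounted for only once even when several external uses resolve to it; the first occurrence wins and later ones are skipped. A standalone sketch of the guard, where the InsertUse and HandleFirstUse names are hypothetical:

#include <unordered_set>
#include <vector>

struct InsertUse {
  void *User; // the insertelement instruction this external use feeds
};

// Act on each distinct insertelement user exactly once, however many
// external uses point at it.
void processExternalUses(const std::vector<InsertUse> &Uses,
                         void (*HandleFirstUse)(const InsertUse &)) {
  std::unordered_set<void *> UsedInserts;
  for (const InsertUse &U : Uses) {
    if (!UsedInserts.insert(U.User).second)
      continue; // already handled this insertelement
    HandleFirstUse(U);
  }
}

The vectorizeTree hunks below apply the same guard when emitting extracts for external users.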
@@ -11008,6 +11034,7 @@ Value *BoUpSLP::vectorizeTree(
   // Maps extract Scalar to the corresponding extractelement instruction in the
   // basic block. Only one extractelement per block should be emitted.
   DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
+  SmallDenseSet<Value *, 4> UsedInserts;
   // Extract all of the elements with the external uses.
   for (const auto &ExternalUse : ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
@@ -11106,6 +11133,8 @@ Value *BoUpSLP::vectorizeTree(
         // Skip if the scalar is another vector op or Vec is not an instruction.
         if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
           if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
+            if (!UsedInserts.insert(VU).second)
+              continue;
             std::optional<unsigned> InsertIdx = getInsertIndex(VU);
             if (InsertIdx) {
               // Need to use original vector, if the root is truncated.
// Need to use original vector, if the root is truncated.
0 commit comments