@@ -2864,6 +2864,12 @@ class BoUpSLP {
2864
2864
/// avoid issues with def-use order.
2865
2865
Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2866
2866
2867
+ TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
2868
+ const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
2869
+ unsigned NodeIdx) const {
2870
+ return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
2871
+ }
2872
+
2867
2873
/// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2868
2874
/// \p E.
2869
2875
/// \param PostponedPHIs true, if need to postpone emission of phi nodes to
@@ -6964,6 +6970,55 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6964
6970
}
6965
6971
}
6966
6972
6973
+ // Check if this is a duplicate of another entry.
6974
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6975
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6976
+ if (!E->isSame(VL)) {
6977
+ auto It = MultiNodeScalars.find(S.OpValue);
6978
+ if (It != MultiNodeScalars.end()) {
6979
+ auto *TEIt = find_if(It->getSecond(),
6980
+ [&](TreeEntry *ME) { return ME->isSame(VL); });
6981
+ if (TEIt != It->getSecond().end())
6982
+ E = *TEIt;
6983
+ else
6984
+ E = nullptr;
6985
+ } else {
6986
+ E = nullptr;
6987
+ }
6988
+ }
6989
+ if (!E) {
6990
+ if (!doesNotNeedToBeScheduled(S.OpValue)) {
6991
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6992
+ if (TryToFindDuplicates(S))
6993
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6994
+ ReuseShuffleIndices);
6995
+ return;
6996
+ }
6997
+ SmallPtrSet<const TreeEntry *, 4> Nodes;
6998
+ Nodes.insert(getTreeEntry(S.OpValue));
6999
+ for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue))
7000
+ Nodes.insert(E);
7001
+ SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
7002
+ if (any_of(Nodes, [&](const TreeEntry *E) {
7003
+ return all_of(E->Scalars,
7004
+ [&](Value *V) { return Values.contains(V); });
7005
+ })) {
7006
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
7007
+ if (TryToFindDuplicates(S))
7008
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7009
+ ReuseShuffleIndices);
7010
+ return;
7011
+ }
7012
+ } else {
7013
+ // Record the reuse of the tree node. FIXME, currently this is only used
7014
+ // to properly draw the graph rather than for the actual vectorization.
7015
+ E->UserTreeIndices.push_back(UserTreeIdx);
7016
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
7017
+ << ".\n");
7018
+ return;
7019
+ }
7020
+ }
7021
+
6967
7022
// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6968
7023
// a load), in which case peek through to include it in the tree, without
6969
7024
// ballooning over-budget.
@@ -7095,55 +7150,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7095
7150
// We now know that this is a vector of instructions of the same type from
7096
7151
// the same block.
7097
7152
7098
- // Check if this is a duplicate of another entry.
7099
- if (TreeEntry *E = getTreeEntry(S.OpValue)) {
7100
- LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
7101
- if (!E->isSame(VL)) {
7102
- auto It = MultiNodeScalars.find(S.OpValue);
7103
- if (It != MultiNodeScalars.end()) {
7104
- auto *TEIt = find_if(It->getSecond(),
7105
- [&](TreeEntry *ME) { return ME->isSame(VL); });
7106
- if (TEIt != It->getSecond().end())
7107
- E = *TEIt;
7108
- else
7109
- E = nullptr;
7110
- } else {
7111
- E = nullptr;
7112
- }
7113
- }
7114
- if (!E) {
7115
- if (!doesNotNeedToBeScheduled(S.OpValue)) {
7116
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
7117
- if (TryToFindDuplicates(S))
7118
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7119
- ReuseShuffleIndices);
7120
- return;
7121
- }
7122
- SmallPtrSet<const TreeEntry *, 4> Nodes;
7123
- Nodes.insert(getTreeEntry(S.OpValue));
7124
- for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue))
7125
- Nodes.insert(E);
7126
- SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
7127
- if (any_of(Nodes, [&](const TreeEntry *E) {
7128
- return all_of(E->Scalars,
7129
- [&](Value *V) { return Values.contains(V); });
7130
- })) {
7131
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
7132
- if (TryToFindDuplicates(S))
7133
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7134
- ReuseShuffleIndices);
7135
- return;
7136
- }
7137
- } else {
7138
- // Record the reuse of the tree node. FIXME, currently this is only used
7139
- // to properly draw the graph rather than for the actual vectorization.
7140
- E->UserTreeIndices.push_back(UserTreeIdx);
7141
- LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
7142
- << ".\n");
7143
- return;
7144
- }
7145
- }
7146
-
7147
7153
// Check that none of the instructions in the bundle are already in the tree.
7148
7154
for (Value *V : VL) {
7149
7155
if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
@@ -9362,22 +9368,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
9362
9368
9363
9369
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
9364
9370
unsigned Idx) const {
9365
- Value *Op = E->getOperand(Idx).front();
9366
- if (const TreeEntry *TE = getTreeEntry(Op)) {
9367
- if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9368
- return EI.EdgeIdx == Idx && EI.UserTE == E;
9369
- }) != TE->UserTreeIndices.end())
9370
- return TE;
9371
- auto MIt = MultiNodeScalars.find(Op);
9372
- if (MIt != MultiNodeScalars.end()) {
9373
- for (const TreeEntry *TE : MIt->second) {
9374
- if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9375
- return EI.EdgeIdx == Idx && EI.UserTE == E;
9376
- }) != TE->UserTreeIndices.end())
9377
- return TE;
9378
- }
9379
- }
9380
- }
9371
+ if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
9372
+ return VE;
9381
9373
const auto *It =
9382
9374
find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9383
9375
return TE->isGather() &&
@@ -12521,120 +12513,123 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
12521
12513
}
12522
12514
};
12523
12515
12524
- Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12525
- bool PostponedPHIs) {
12526
- ValueList &VL = E->getOperand(NodeIdx);
12527
- const unsigned VF = VL.size();
12516
+ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
12517
+ unsigned NodeIdx) {
12518
+ ArrayRef<Value *> VL = E->getOperand(NodeIdx);
12528
12519
InstructionsState S = getSameOpcode(VL, *TLI);
12529
12520
// Special processing for GEPs bundle, which may include non-gep values.
12530
12521
if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12531
12522
const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12532
12523
if (It != VL.end())
12533
12524
S = getSameOpcode(*It, *TLI);
12534
12525
}
12535
- if (S.getOpcode()) {
12536
- auto CheckSameVE = [&](const TreeEntry *VE) {
12537
- return VE->isSame(VL) &&
12538
- (any_of(VE->UserTreeIndices,
12539
- [E, NodeIdx](const EdgeInfo &EI) {
12540
- return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12541
- }) ||
12542
- any_of(VectorizableTree,
12543
- [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12544
- return TE->isOperandGatherNode({E, NodeIdx}) &&
12545
- VE->isSame(TE->Scalars);
12546
- }));
12526
+ if (!S.getOpcode())
12527
+ return nullptr;
12528
+ auto CheckSameVE = [&](const TreeEntry *VE) {
12529
+ return VE->isSame(VL) &&
12530
+ (any_of(VE->UserTreeIndices,
12531
+ [E, NodeIdx](const EdgeInfo &EI) {
12532
+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12533
+ }) ||
12534
+ any_of(VectorizableTree,
12535
+ [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12536
+ return TE->isOperandGatherNode(
12537
+ {const_cast<TreeEntry *>(E), NodeIdx}) &&
12538
+ VE->isSame(TE->Scalars);
12539
+ }));
12540
+ };
12541
+ TreeEntry *VE = getTreeEntry(S.OpValue);
12542
+ if (VE && CheckSameVE(VE))
12543
+ return VE;
12544
+ auto It = MultiNodeScalars.find(S.OpValue);
12545
+ if (It != MultiNodeScalars.end()) {
12546
+ auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12547
+ return TE != VE && CheckSameVE(TE);
12548
+ });
12549
+ if (I != It->getSecond().end())
12550
+ return *I;
12551
+ }
12552
+ return nullptr;
12553
+ }
12554
+
12555
+ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12556
+ bool PostponedPHIs) {
12557
+ ValueList &VL = E->getOperand(NodeIdx);
12558
+ const unsigned VF = VL.size();
12559
+ if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
12560
+ auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12561
+ // V may be affected by MinBWs.
12562
+ // We want ShuffleInstructionBuilder to correctly support REVEC. The key
12563
+ // factor is the number of elements, not their type.
12564
+ Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
12565
+ unsigned NumElements = getNumElements(VL.front()->getType());
12566
+ ShuffleInstructionBuilder ShuffleBuilder(
12567
+ NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
12568
+ : ScalarTy,
12569
+ Builder, *this);
12570
+ ShuffleBuilder.add(V, Mask);
12571
+ SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
12572
+ E->CombinedEntriesWithIndices.size());
12573
+ transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
12574
+ [&](const auto &P) {
12575
+ return std::make_pair(VectorizableTree[P.first].get(),
12576
+ P.second);
12577
+ });
12578
+ return ShuffleBuilder.finalize(std::nullopt, SubVectors);
12547
12579
};
12548
- TreeEntry *VE = getTreeEntry(S.OpValue);
12549
- bool IsSameVE = VE && CheckSameVE(VE);
12550
- if (!IsSameVE) {
12551
- auto It = MultiNodeScalars.find(S.OpValue);
12552
- if (It != MultiNodeScalars.end()) {
12553
- auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12554
- return TE != VE && CheckSameVE(TE);
12555
- });
12556
- if (I != It->getSecond().end()) {
12557
- VE = *I;
12558
- IsSameVE = true;
12559
- }
12560
- }
12561
- }
12562
- if (IsSameVE) {
12563
- auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12564
- // V may be affected by MinBWs.
12565
- // We want ShuffleInstructionBuilder to correctly support REVEC. The key
12566
- // factor is the number of elements, not their type.
12567
- Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
12568
- unsigned NumElements = getNumElements(VL.front()->getType());
12569
- ShuffleInstructionBuilder ShuffleBuilder(
12570
- NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
12571
- : ScalarTy,
12572
- Builder, *this);
12573
- ShuffleBuilder.add(V, Mask);
12574
- SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
12575
- E->CombinedEntriesWithIndices.size());
12576
- transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
12577
- [&](const auto &P) {
12578
- return std::make_pair(VectorizableTree[P.first].get(),
12579
- P.second);
12580
- });
12581
- return ShuffleBuilder.finalize(std::nullopt, SubVectors);
12582
- };
12583
- Value *V = vectorizeTree(VE, PostponedPHIs);
12584
- if (VF * getNumElements(VL[0]->getType()) !=
12585
- cast<FixedVectorType>(V->getType())->getNumElements()) {
12586
- if (!VE->ReuseShuffleIndices.empty()) {
12587
- // Reshuffle to get only unique values.
12588
- // If some of the scalars are duplicated in the vectorization
12589
- // tree entry, we do not vectorize them but instead generate a
12590
- // mask for the reuses. But if there are several users of the
12591
- // same entry, they may have different vectorization factors.
12592
- // This is especially important for PHI nodes. In this case, we
12593
- // need to adapt the resulting instruction for the user
12594
- // vectorization factor and have to reshuffle it again to take
12595
- // only unique elements of the vector. Without this code the
12596
- // function incorrectly returns reduced vector instruction with
12597
- // the same elements, not with the unique ones.
12598
-
12599
- // block:
12600
- // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12601
- // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12602
- // ... (use %2)
12603
- // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12604
- // br %block
12605
- SmallVector<int> Mask(VF, PoisonMaskElem);
12606
- for (auto [I, V] : enumerate(VL)) {
12607
- if (isa<PoisonValue>(V))
12608
- continue;
12609
- Mask[I] = VE->findLaneForValue(V);
12610
- }
12611
- V = FinalShuffle(V, Mask);
12612
- } else {
12613
- assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12614
- "Expected vectorization factor less "
12615
- "than original vector size.");
12616
- SmallVector<int> UniformMask(VF, 0);
12617
- std::iota(UniformMask.begin(), UniformMask.end(), 0);
12618
- V = FinalShuffle(V, UniformMask);
12580
+ Value *V = vectorizeTree(VE, PostponedPHIs);
12581
+ if (VF * getNumElements(VL[0]->getType()) !=
12582
+ cast<FixedVectorType>(V->getType())->getNumElements()) {
12583
+ if (!VE->ReuseShuffleIndices.empty()) {
12584
+ // Reshuffle to get only unique values.
12585
+ // If some of the scalars are duplicated in the vectorization
12586
+ // tree entry, we do not vectorize them but instead generate a
12587
+ // mask for the reuses. But if there are several users of the
12588
+ // same entry, they may have different vectorization factors.
12589
+ // This is especially important for PHI nodes. In this case, we
12590
+ // need to adapt the resulting instruction for the user
12591
+ // vectorization factor and have to reshuffle it again to take
12592
+ // only unique elements of the vector. Without this code the
12593
+ // function incorrectly returns reduced vector instruction with
12594
+ // the same elements, not with the unique ones.
12595
+
12596
+ // block:
12597
+ // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12598
+ // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12599
+ // ... (use %2)
12600
+ // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12601
+ // br %block
12602
+ SmallVector<int> Mask(VF, PoisonMaskElem);
12603
+ for (auto [I, V] : enumerate(VL)) {
12604
+ if (isa<PoisonValue>(V))
12605
+ continue;
12606
+ Mask[I] = VE->findLaneForValue(V);
12619
12607
}
12620
- }
12621
- // Need to update the operand gather node, if actually the operand is not a
12622
- // vectorized node, but the buildvector/gather node, which matches one of
12623
- // the vectorized nodes.
12624
- if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12625
- return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12626
- }) == VE->UserTreeIndices.end()) {
12627
- auto *It = find_if(
12628
- VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12629
- return TE->isGather() &&
12630
- TE->UserTreeIndices.front().UserTE == E &&
12631
- TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12632
- });
12633
- assert(It != VectorizableTree.end() && "Expected gather node operand.");
12634
- (*It)->VectorizedValue = V;
12635
- }
12636
- return V;
12608
+ V = FinalShuffle(V, Mask);
12609
+ } else {
12610
+ assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12611
+ "Expected vectorization factor less "
12612
+ "than original vector size.");
12613
+ SmallVector<int> UniformMask(VF, 0);
12614
+ std::iota(UniformMask.begin(), UniformMask.end(), 0);
12615
+ V = FinalShuffle(V, UniformMask);
12616
+ }
12617
+ }
12618
+ // Need to update the operand gather node, if actually the operand is not a
12619
+ // vectorized node, but the buildvector/gather node, which matches one of
12620
+ // the vectorized nodes.
12621
+ if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12622
+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12623
+ }) == VE->UserTreeIndices.end()) {
12624
+ auto *It =
12625
+ find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12626
+ return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
12627
+ TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12628
+ });
12629
+ assert(It != VectorizableTree.end() && "Expected gather node operand.");
12630
+ (*It)->VectorizedValue = V;
12637
12631
}
12632
+ return V;
12638
12633
}
12639
12634
12640
12635
// Find the corresponding gather entry and vectorize it.
0 commit comments