@@ -7888,19 +7888,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
7888
7888
unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
7889
7889
unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
7890
7890
unsigned VecOpcode;
7891
- auto *SrcVecTy =
7891
+ auto *UserVecTy =
7892
7892
FixedVectorType::get(UserScalarTy, E->getVectorFactor());
7893
7893
if (BWSz > SrcBWSz)
7894
7894
VecOpcode = Instruction::Trunc;
7895
7895
else
7896
7896
VecOpcode =
7897
7897
It->second.second ? Instruction::SExt : Instruction::ZExt;
7898
7898
TTI::CastContextHint CCH = GetCastContextHint(VL0);
7899
- VecCost += TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy , CCH,
7899
+ VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy , CCH,
7900
7900
CostKind);
7901
- ScalarCost +=
7902
- Sz * TTI->getCastInstrCost(VecOpcode, ScalarTy, UserScalarTy,
7903
- CCH, CostKind);
7901
+ ScalarCost += Sz * TTI->getCastInstrCost(VecOpcode, UserScalarTy,
7902
+ ScalarTy, CCH, CostKind);
7904
7903
}
7905
7904
}
7906
7905
}
@@ -8981,7 +8980,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
8981
8980
SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
8982
8981
SmallVector<APInt> DemandedElts;
8983
8982
SmallDenseSet<Value *, 4> UsedInserts;
8984
- DenseSet<Value * > VectorCasts;
8983
+ DenseSet<std::pair<const TreeEntry *, Type *> > VectorCasts;
8985
8984
for (ExternalUser &EU : ExternalUses) {
8986
8985
// We only add extract cost once for the same scalar.
8987
8986
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -9051,11 +9050,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
9051
9050
DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
9052
9051
VecId = FirstUsers.size() - 1;
9053
9052
auto It = MinBWs.find(ScalarTE);
9054
- if (It != MinBWs.end() && VectorCasts.insert(EU.Scalar).second) {
9053
+ if (It != MinBWs.end() &&
9054
+ VectorCasts
9055
+ .insert(std::make_pair(ScalarTE, FTy->getElementType()))
9056
+ .second) {
9055
9057
unsigned BWSz = It->second.second;
9056
- unsigned SrcBWSz = DL->getTypeSizeInBits(FTy->getElementType());
9058
+ unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
9057
9059
unsigned VecOpcode;
9058
- if (BWSz < SrcBWSz )
9060
+ if (DstBWSz < BWSz )
9059
9061
VecOpcode = Instruction::Trunc;
9060
9062
else
9061
9063
VecOpcode =
@@ -9108,17 +9110,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
9108
9110
}
9109
9111
// Add reduced value cost, if resized.
9110
9112
if (!VectorizedVals.empty()) {
9111
- auto BWIt = MinBWs.find(VectorizableTree.front().get());
9113
+ const TreeEntry &Root = *VectorizableTree.front().get();
9114
+ auto BWIt = MinBWs.find(&Root);
9112
9115
if (BWIt != MinBWs.end()) {
9113
- Type *DstTy = VectorizableTree.front()-> Scalars.front()->getType();
9116
+ Type *DstTy = Root. Scalars.front()->getType();
9114
9117
unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
9115
- unsigned Opcode = Instruction::Trunc;
9116
- if (OriginalSz < BWIt->second.first)
9117
- Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
9118
- Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first);
9119
- Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
9120
- TTI::CastContextHint::None,
9121
- TTI::TCK_RecipThroughput);
9118
+ if (OriginalSz != BWIt->second.first) {
9119
+ unsigned Opcode = Instruction::Trunc;
9120
+ if (OriginalSz < BWIt->second.first)
9121
+ Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
9122
+ Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first);
9123
+ Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
9124
+ TTI::CastContextHint::None,
9125
+ TTI::TCK_RecipThroughput);
9126
+ }
9122
9127
}
9123
9128
}
9124
9129
@@ -11419,9 +11424,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
11419
11424
VecOpcode = Instruction::BitCast;
11420
11425
} else if (BWSz < SrcBWSz) {
11421
11426
VecOpcode = Instruction::Trunc;
11422
- } else if (It != MinBWs.end()) {
11427
+ } else if (SrcIt != MinBWs.end()) {
11423
11428
assert(BWSz > SrcBWSz && "Invalid cast!");
11424
- VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11429
+ VecOpcode =
11430
+ SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11425
11431
}
11426
11432
}
11427
11433
Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
@@ -11929,7 +11935,7 @@ Value *BoUpSLP::vectorizeTree(
11929
11935
// basic block. Only one extractelement per block should be emitted.
11930
11936
DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
11931
11937
SmallDenseSet<Value *, 4> UsedInserts;
11932
- DenseMap<Value *, Value *> VectorCasts;
11938
+ DenseMap<std::pair< Value *, Type *> , Value *> VectorCasts;
11933
11939
SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
11934
11940
// Extract all of the elements with the external uses.
11935
11941
for (const auto &ExternalUse : ExternalUses) {
@@ -12050,18 +12056,20 @@ Value *BoUpSLP::vectorizeTree(
12050
12056
// Need to use original vector, if the root is truncated.
12051
12057
auto BWIt = MinBWs.find(E);
12052
12058
if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
12053
- auto VecIt = VectorCasts.find(Scalar);
12059
+ auto *ScalarTy = FTy->getElementType();
12060
+ auto Key = std::make_pair(Vec, ScalarTy);
12061
+ auto VecIt = VectorCasts.find(Key);
12054
12062
if (VecIt == VectorCasts.end()) {
12055
12063
IRBuilder<>::InsertPointGuard Guard(Builder);
12056
12064
if (auto *IVec = dyn_cast<Instruction>(Vec))
12057
12065
Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
12058
12066
Vec = Builder.CreateIntCast(
12059
12067
Vec,
12060
12068
FixedVectorType::get(
12061
- cast<VectorType>(VU->getType())->getElementType() ,
12069
+ ScalarTy ,
12062
12070
cast<FixedVectorType>(Vec->getType())->getNumElements()),
12063
12071
BWIt->second.second);
12064
- VectorCasts.try_emplace(Scalar , Vec);
12072
+ VectorCasts.try_emplace(Key , Vec);
12065
12073
} else {
12066
12074
Vec = VecIt->second;
12067
12075
}
0 commit comments