@@ -6954,6 +6954,82 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
}
+/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
+static std::pair<InstructionCost, InstructionCost>
+getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
+            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
+            Type *ScalarTy, VectorType *VecTy) {
+  InstructionCost ScalarCost = 0;
+  InstructionCost VecCost = 0;
+  // Here we differentiate two cases: (1) when Ptrs represent a regular
+  // vectorization tree node (as they are pointer arguments of scattered
+  // loads) or (2) when Ptrs are the arguments of loads or stores being
+  // vectorized as a plain wide unit-stride load/store, since all the
+  // loads/stores are known to be from/to adjacent locations.
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    // Case 2: estimate the pointer-related costs when vectorizing to a wide
+    // load/store.
+    // The scalar cost is estimated as that of a set of pointers with a known
+    // relationship between them.
+    // For vector code we will use BasePtr as the argument of the wide
+    // load/store, but we also need to account for all the instructions that
+    // are going to stay in the vectorized code due to uses outside of these
+    // scalar loads/stores.
+    ScalarCost = TTI.getPointersChainCost(
+        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
+        CostKind);
+
+    SmallVector<const Value *> PtrsRetainedInVecCode;
+    for (Value *V : Ptrs) {
+      if (V == BasePtr) {
+        PtrsRetainedInVecCode.push_back(V);
+        continue;
+      }
+      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+      // For simplicity, assume Ptr stays in vectorized code if it's not a
+      // GEP instruction; we don't care, since its cost is considered free.
+      // TODO: We should check for any uses outside of the vectorizable tree
+      // rather than just for a single use.
+      if (!Ptr || !Ptr->hasOneUse())
+        PtrsRetainedInVecCode.push_back(V);
+    }
+
+    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
+      // If all pointers stay in vectorized code, then we don't get any
+      // savings there.
+      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
+    }
+    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
+                                       TTI::PointersChainInfo::getKnownStride(),
+                                       VecTy, CostKind);
+  } else {
+    // Case 1: Ptrs are the arguments of loads that we are going to transform
+    // into a masked gather load intrinsic.
+    // All the scalar GEPs will be removed as a result of vectorization.
+    // For any external uses of some lanes, extractelement instructions will
+    // be generated (their cost is estimated separately).
+    TTI::PointersChainInfo PtrsInfo =
+        all_of(Ptrs,
+               [](const Value *V) {
+                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+                 return Ptr && !Ptr->hasAllConstantIndices();
+               })
+            ? TTI::PointersChainInfo::getUnknownStride()
+            : TTI::PointersChainInfo::getKnownStride();
+
+    ScalarCost =
+        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
+    if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
+      SmallVector<const Value *> Indices(BaseGEP->indices());
+      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
+                               BaseGEP->getPointerOperand(), Indices, VecTy,
+                               CostKind);
+    }
+  }
+
+  return std::make_pair(ScalarCost, VecCost);
+}
+
 /// Merges shuffle masks and emits final shuffle instruction, if required. It
 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
 /// when the actual shuffle instruction is generated only if this is actually
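
For intuition, here is a small illustrative sketch (not part of the patch; the function names are made up) of the two source shapes the helper distinguishes:

// Case 2: the four loads are adjacent, so SLP can emit one wide load; the
// scalar GEPs for p[1]..p[3] fold away and only the base pointer survives
// in vector code.
int sum4(const int *p) { return p[0] + p[1] + p[2] + p[3]; }

// Case 1: lanes are addressed through runtime indices, so the loads become
// a masked gather; all per-lane GEPs are vectorized away, and because the
// GEP indices are non-constant the pointers chain has an unknown stride.
int gather4(const int *p, const int *idx) {
  return p[idx[0]] + p[idx[1]] + p[idx[2]] + p[idx[3]];
}
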
@@ -7917,78 +7993,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   // Calculate cost difference from vectorizing set of GEPs.
   // Negative value means vectorizing is profitable.
   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
-    InstructionCost ScalarCost = 0;
-    InstructionCost VecCost = 0;
-    // Here we differentiate two cases: (1) when Ptrs represent a regular
-    // vectorization tree node (as they are pointer arguments of scattered
-    // loads) or (2) when Ptrs are the arguments of loads or stores being
-    // vectorized as plane wide unit-stride load/store since all the
-    // loads/stores are known to be from/to adjacent locations.
     assert(E->State == TreeEntry::Vectorize &&
            "Entry state expected to be Vectorize here.");
-    if (isa<LoadInst, StoreInst>(VL0)) {
-      // Case 2: estimate costs for pointer related costs when vectorizing to
-      // a wide load/store.
-      // Scalar cost is estimated as a set of pointers with known relationship
-      // between them.
-      // For vector code we will use BasePtr as argument for the wide load/store
-      // but we also need to account all the instructions which are going to
-      // stay in vectorized code due to uses outside of these scalar
-      // loads/stores.
-      ScalarCost = TTI->getPointersChainCost(
-          Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
-          CostKind);
-
-      SmallVector<const Value *> PtrsRetainedInVecCode;
-      for (Value *V : Ptrs) {
-        if (V == BasePtr) {
-          PtrsRetainedInVecCode.push_back(V);
-          continue;
-        }
-        auto *Ptr = dyn_cast<GetElementPtrInst>(V);
-        // For simplicity assume Ptr to stay in vectorized code if it's not a
-        // GEP instruction. We don't care since it's cost considered free.
-        // TODO: We should check for any uses outside of vectorizable tree
-        // rather than just single use.
-        if (!Ptr || !Ptr->hasOneUse())
-          PtrsRetainedInVecCode.push_back(V);
-      }
-
-      if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
-        // If all pointers stay in vectorized code then we don't have
-        // any savings on that.
-        LLVM_DEBUG(dumpTreeCosts(E, 0, ScalarCost, ScalarCost,
-                                 "Calculated GEPs cost for Tree"));
-        return InstructionCost{TTI::TCC_Free};
-      }
-      VecCost = TTI->getPointersChainCost(
-          PtrsRetainedInVecCode, BasePtr,
-          TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
-    } else {
-      // Case 1: Ptrs are the arguments of loads that we are going to transform
-      // into masked gather load intrinsic.
-      // All the scalar GEPs will be removed as a result of vectorization.
-      // For any external uses of some lanes extract element instructions will
-      // be generated (which cost is estimated separately).
-      TTI::PointersChainInfo PtrsInfo =
-          all_of(Ptrs,
-                 [](const Value *V) {
-                   auto *Ptr = dyn_cast<GetElementPtrInst>(V);
-                   return Ptr && !Ptr->hasAllConstantIndices();
-                 })
-              ? TTI::PointersChainInfo::getUnknownStride()
-              : TTI::PointersChainInfo::getKnownStride();
-
-      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
-                                             CostKind);
-      if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
-        SmallVector<const Value *> Indices(BaseGEP->indices());
-        VecCost = TTI->getGEPCost(BaseGEP->getSourceElementType(),
-                                  BaseGEP->getPointerOperand(), Indices, VecTy,
-                                  CostKind);
-      }
-    }
-
+    InstructionCost ScalarCost = 0;
+    InstructionCost VecCost = 0;
+    std::tie(ScalarCost, VecCost) = getGEPCosts(
+        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, ScalarTy, VecTy);
     LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                              "Calculated GEPs cost for Tree"));
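
As a usage note, the caller keeps the two costs in pre-declared variables and unpacks the pair with std::tie; with C++17 a structured binding reads the same way. A minimal sketch, assuming the same operands as in the lambda above (the variable names here are illustrative):

// Hypothetical caller sketch: a negative difference means the vectorized
// form of the pointer arithmetic is cheaper than the scalar form.
auto [ScalarGEPCost, VecGEPCost] =
    getGEPCosts(*TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, ScalarTy, VecTy);
InstructionCost GEPCostDiff = VecGEPCost - ScalarGEPCost;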