-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[SLP]Improved reduction cost/codegen #118293
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Improved reduction cost/codegen #118293
Conversation
Created using spr 1.3.5
Your org has enabled the Graphite merge queue for merging into main. Add the label “FP Bundles” to the PR and Graphite will automatically add it to the merge queue when it’s ready to merge. You must have a Graphite account and log in to Graphite in order to use the merge queue. Sign up using this link. |
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-analysis Author: Alexey Bataev (alexey-bataev) Changes: AVX512, -O3+LTO Program size..text Benchmarks/Shootout-C++ - same transformed reduction RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same
Patch is 22.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118293.diff 5 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 985ca1532e0149..f2f0e56a3f2014 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1584,6 +1584,10 @@ class TargetTransformInfo {
/// split during legalization. Zero is returned when the answer is unknown.
unsigned getNumberOfParts(Type *Tp) const;
+ /// \return true if \p Tp represent a type, fully occupying whole register,
+ /// false otherwise.
+ bool isFullSingleRegisterType(Type *Tp) const;
+
/// \returns The cost of the address computation. For most targets this can be
/// merged into the instruction indexing mode. Some targets might want to
/// distinguish between address computation for memory operations on vector
@@ -2196,6 +2200,7 @@ class TargetTransformInfo::Concept {
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
+ virtual bool isFullSingleRegisterType(Type *Tp) const = 0;
virtual InstructionCost
getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
virtual InstructionCost
@@ -2930,6 +2935,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getNumberOfParts(Type *Tp) override {
return Impl.getNumberOfParts(Tp);
}
+ bool isFullSingleRegisterType(Type *Tp) const override {
+ return Impl.isFullSingleRegisterType(Tp);
+ }
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
const SCEV *Ptr) override {
return Impl.getAddressComputationCost(Ty, SE, Ptr);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 38aba183f6a173..ce6a96ea317ba7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -833,6 +833,7 @@ class TargetTransformInfoImplBase {
// Assume that we have a register of the right size for the type.
unsigned getNumberOfParts(Type *Tp) const { return 1; }
+ bool isFullSingleRegisterType(Type *Tp) const { return false; }
InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *,
const SCEV *) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 98cbb4886642bf..9e7ce48f901dc5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2612,6 +2612,22 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return *LT.first.getValue();
}
+ bool isFullSingleRegisterType(Type *Tp) const {
+ std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+ if (!LT.first.isValid() || LT.first > 1)
+ return false;
+
+ if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
+ Tp && LT.second.isFixedLengthVector()) {
+ // Check if the n x i1 fits fully into largest integer.
+ if (unsigned VF = LT.second.getVectorNumElements();
+ LT.second.getVectorElementType() == MVT::i1)
+ return DL.isLegalInteger(VF) && !DL.isLegalInteger(VF * 2);
+ return FTp == EVT(LT.second).getTypeForEVT(Tp->getContext());
+ }
+ return false;
+ }
+
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
const SCEV *) {
return 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1fb2b9836de0cc..f7ad9ed905e3a1 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1171,6 +1171,10 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return TTIImpl->getNumberOfParts(Tp);
}
+bool TargetTransformInfo::isFullSingleRegisterType(Type *Tp) const {
+ return TTIImpl->isFullSingleRegisterType(Tp);
+}
+
InstructionCost
TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
const SCEV *Ptr) const {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7723442bc0fb6e..5df21b77643746 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12080,7 +12080,11 @@ bool BoUpSLP::isTreeNotExtendable() const {
TreeEntry &E = *VectorizableTree[Idx];
if (!E.isGather())
continue;
- if (E.getOpcode() && E.getOpcode() != Instruction::Load)
+ if ((E.getOpcode() && E.getOpcode() != Instruction::Load) ||
+ (!E.getOpcode() &&
+ all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
+ (isa<ExtractElementInst>(E.Scalars.front()) &&
+ getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).getOpcode()))
return false;
if (isSplat(E.Scalars) || allConstant(E.Scalars))
continue;
@@ -19174,6 +19178,9 @@ class HorizontalReduction {
/// Checks if the optimization of original scalar identity operations on
/// matched horizontal reductions is enabled and allowed.
bool IsSupportedHorRdxIdentityOp = false;
+ /// Contains vector values for reduction including their scale factor and
+ /// signedness.
+ SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
@@ -19225,17 +19232,22 @@ class HorizontalReduction {
static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
+ Type *OpTy = LHS->getType();
+ assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
switch (Kind) {
case RecurKind::Or:
- if (UseSelect &&
- LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
- return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
+ if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+ return Builder.CreateSelect(
+ LHS,
+ ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
+ RHS, Name);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
case RecurKind::And:
- if (UseSelect &&
- LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
- return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
+ if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+ return Builder.CreateSelect(
+ LHS, RHS,
+ ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
case RecurKind::Add:
@@ -20108,12 +20120,11 @@ class HorizontalReduction {
SameValuesCounter, TrackedToOrig);
}
- Value *ReducedSubTree;
Type *ScalarTy = VL.front()->getType();
if (isa<FixedVectorType>(ScalarTy)) {
assert(SLPReVec && "FixedVectorType is not expected.");
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
- ReducedSubTree = PoisonValue::get(FixedVectorType::get(
+ Value *ReducedSubTree = PoisonValue::get(getWidenedType(
VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
// Do reduction for each lane.
@@ -20131,30 +20142,32 @@ class HorizontalReduction {
SmallVector<int, 16> Mask =
createStrideMask(I, ScalarTyNumElements, VL.size());
Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
- ReducedSubTree = Builder.CreateInsertElement(
- ReducedSubTree,
- emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
+ Value *Val =
+ createSingleOp(Builder, *TTI, Lane,
+ OptReusedScalars && SameScaleFactor
+ ? SameValuesCounter.front().second
+ : 1,
+ Lane->getType()->getScalarType() !=
+ VL.front()->getType()->getScalarType()
+ ? V.isSignedMinBitwidthRootNode()
+ : true, RdxRootInst->getType());
+ ReducedSubTree =
+ Builder.CreateInsertElement(ReducedSubTree, Val, I);
}
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
} else {
- ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
- RdxRootInst->getType());
+ Type *VecTy = VectorizedRoot->getType();
+ Type *RedScalarTy = VecTy->getScalarType();
+ VectorValuesAndScales.emplace_back(
+ VectorizedRoot,
+ OptReusedScalars && SameScaleFactor
+ ? SameValuesCounter.front().second
+ : 1,
+ RedScalarTy != ScalarTy->getScalarType()
+ ? V.isSignedMinBitwidthRootNode()
+ : true);
}
- if (ReducedSubTree->getType() != VL.front()->getType()) {
- assert(ReducedSubTree->getType() != VL.front()->getType() &&
- "Expected different reduction type.");
- ReducedSubTree =
- Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
- V.isSignedMinBitwidthRootNode());
- }
-
- // Improved analysis for add/fadd/xor reductions with same scale factor
- // for all operands of reductions. We can emit scalar ops for them
- // instead.
- if (OptReusedScalars && SameScaleFactor)
- ReducedSubTree = emitScaleForReusedOps(
- ReducedSubTree, Builder, SameValuesCounter.front().second);
- VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
// Count vectorized reduced values to exclude them from final reduction.
for (Value *RdxVal : VL) {
Value *OrigV = TrackedToOrig.at(RdxVal);
@@ -20183,6 +20196,10 @@ class HorizontalReduction {
continue;
}
}
+ if (!VectorValuesAndScales.empty())
+ VectorizedTree = GetNewVectorizedTree(
+ VectorizedTree,
+ emitReduction(Builder, *TTI, ReductionRoot->getType()));
if (VectorizedTree) {
// Reorder operands of bool logical op in the natural order to avoid
// possible problem with poison propagation. If not possible to reorder
@@ -20317,6 +20334,28 @@ class HorizontalReduction {
}
private:
+ /// Checks if the given type \p Ty is a vector type, which does not occupy the
+ /// whole vector register or is expensive for extraction.
+ static bool isNotFullVectorType(const TargetTransformInfo &TTI, Type *Ty) {
+ return TTI.getNumberOfParts(Ty) == 1 && !TTI.isFullSingleRegisterType(Ty);
+ }
+
+ /// Creates the reduction from the given \p Vec vector value with the given
+ /// scale \p Scale and signedness \p IsSigned.
+ Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+ Value *Vec, unsigned Scale, bool IsSigned,
+ Type *DestTy) {
+ Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
+ if (Rdx->getType() != DestTy->getScalarType())
+ Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
+ // Improved analysis for add/fadd/xor reductions with same scale
+ // factor for all operands of reductions. We can emit scalar ops for
+ // them instead.
+ if (Scale > 1)
+ Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
+ return Rdx;
+ }
+
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
@@ -20359,6 +20398,22 @@ class HorizontalReduction {
}
return Cost;
};
+ // Require reduction cost if:
+ // 1. This type is not a full register type and no other vectors with the
+ // same type in the storage (first vector with small type).
+ // 2. The storage does not have any vector with full vector use (first
+ // vector with full register use).
+ bool DoesRequireReductionOp =
+ !AllConsts &&
+ (VectorValuesAndScales.empty() ||
+ (isNotFullVectorType(*TTI, VectorTy) &&
+ none_of(VectorValuesAndScales,
+ [&](const auto &P) {
+ return std::get<0>(P)->getType() == VectorTy;
+ })) ||
+ all_of(VectorValuesAndScales, [&](const auto &P) {
+ return isNotFullVectorType(*TTI, std::get<0>(P)->getType());
+ }));
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
@@ -20382,7 +20437,7 @@ class HorizontalReduction {
VectorCost += TTI->getScalarizationOverhead(
VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
/*Extract*/ false, TTI::TCK_RecipThroughput);
- } else {
+ } else if (DoesRequireReductionOp) {
Type *RedTy = VectorTy->getElementType();
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
@@ -20394,6 +20449,14 @@ class HorizontalReduction {
RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
FMF, CostKind);
}
+ } else {
+ unsigned NumParts = TTI->getNumberOfParts(VectorTy);
+ unsigned RegVF = getPartNumElems(getNumElements(VectorTy), NumParts);
+ VectorCost +=
+ NumParts * TTI->getArithmeticInstrCost(
+ RdxOpcode,
+ getWidenedType(VectorTy->getScalarType(), RegVF),
+ CostKind);
}
}
ScalarCost = EvaluateScalarCost([&]() {
@@ -20410,8 +20473,19 @@ class HorizontalReduction {
case RecurKind::UMax:
case RecurKind::UMin: {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
- if (!AllConsts)
- VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+ if (!AllConsts) {
+ if (DoesRequireReductionOp) {
+ VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+ } else {
+ // Check if the previous reduction already exists and account it as
+ // series of operations + single reduction.
+ unsigned NumParts = TTI->getNumberOfParts(VectorTy);
+ unsigned RegVF = getPartNumElems(getNumElements(VectorTy), NumParts);
+ auto *RegVecTy = getWidenedType(VectorTy->getScalarType(), RegVF);
+ IntrinsicCostAttributes ICA(Id, RegVecTy, {RegVecTy, RegVecTy}, FMF);
+ VectorCost += NumParts * TTI->getIntrinsicInstrCost(ICA, CostKind);
+ }
+ }
ScalarCost = EvaluateScalarCost([&]() {
IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
return TTI->getIntrinsicInstrCost(ICA, CostKind);
@@ -20428,6 +20502,190 @@ class HorizontalReduction {
return VectorCost - ScalarCost;
}
+ /// Splits the values, stored in VectorValuesAndScales, into registers/free
+ /// sub-registers, combines them with the given reduction operation as a
+ /// vector operation and then performs single (small enough) reduction.
+ Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+ Type *DestTy) {
+ Value *ReducedSubTree = nullptr;
+ // Creates reduction and combines with the previous reduction.
+ auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
+ Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
+ if (ReducedSubTree)
+ ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
+ "op.rdx", ReductionOps);
+ else
+ ReducedSubTree = Rdx;
+ };
+ if (VectorValuesAndScales.size() == 1) {
+ const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
+ CreateSingleOp(Vec, Scale, IsSigned);
+ return ReducedSubTree;
+ }
+ // Splits multivector value into per-register values.
+ auto SplitVector = [&](Value *Vec) {
+ auto *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
+ unsigned Sz = getNumElements(Vec->getType());
+ unsigned NumParts = TTI.getNumberOfParts(Vec->getType());
+ if (NumParts <= 1 || NumParts >= Sz ||
+ isNotFullVectorType(TTI, Vec->getType()))
+ return SmallVector<Value *>(1, Vec);
+ unsigned RegSize = getPartNumElems(Sz, NumParts);
+ auto *DstTy = getWidenedType(ScalarTy, RegSize);
+ SmallVector<Value *> Regs(NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts))
+ Regs[Part] = Builder.CreateExtractVector(
+ DstTy, Vec, Builder.getInt64(Part * RegSize));
+ return Regs;
+ };
+ SmallMapVector<Type *, Value *, 4> VecOps;
+ // Scales Vec using given Cnt scale factor and then performs vector combine
+ // with previous value of VecOp.
+ auto CreateVecOp = [&](Value *Vec, unsigned Cnt) {
+ Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
+ // Scale Vec using given Cnt scale factor.
+ if (Cnt > 1) {
+ ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
+ switch (RdxKind) {
+ case RecurKind::Add: {
+ if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
+ unsigned VF = getNumElements(Vec->getType());
+ LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
+ << ". (HorRdx)\n");
+ SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
+ for (unsigned I : seq<unsigned>(Cnt))
+ std::iota(std::next(Mask.begin(), VF * I),
+ std::next(Mask.begin(), VF * (I + 1)), 0);
+ ++NumVectorInstructions;
+ Vec = Builder.CreateShuffleVector(Vec, Mask);
+ break;
+ }
+ // res = mul vv, n
+ Value *Scale =
+ ConstantVector::getSplat(EC, ConstantInt::get(ScalarTy, Cnt));
+ LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
+ << ". (HorRdx)\n");
+ ++NumVectorInstructions;
+ Vec = Builder.CreateMul(Vec, Scale);
+ break;
+ }
+ case RecurKind::Xor: {
+ // res = n % 2 ? 0 : vv
+ LLVM_DEBUG(dbgs()
+ << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
+ if (Cnt % 2 == 0)
+ Vec = Constant::getNullValue(Vec->getType());
+ break;
+ }
+ case RecurKind::FAdd: {
+ // res = fmul v, n
+ Value *Scale =
+ ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
+ LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
+ << ". (HorRdx)\n");
+ ++NumVectorInstructions;
+ Vec = Builder.CreateFMul(Vec, Scale);
+ break;
+ }
+ case RecurKind::And:
+ case RecurKind::Or:
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin:
+ case RecurKind::FMax:
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
+ // res = vv
+ break;
+ case RecurKind::Mul:
+ case RecurKind::FMul:
+ case RecurKind::FMulAdd:
+ case RecurKind::IAnyOf:
+ case RecurKind::FAnyOf:
+ case RecurKind::None:
+ llvm_unreachable("Unexpected reduction kind for repeated scalar.");
+ }
+ }
+ // Combine Vec w...
[truncated]
|
@llvm/pr-subscribers-vectorizers Author: Alexey Bataev (alexey-bataev) Changes: AVX512, -O3+LTO Program size..text Benchmarks/Shootout-C++ - same transformed reduction RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same
Patch is 22.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118293.diff 5 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 985ca1532e0149..f2f0e56a3f2014 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1584,6 +1584,10 @@ class TargetTransformInfo {
/// split during legalization. Zero is returned when the answer is unknown.
unsigned getNumberOfParts(Type *Tp) const;
+ /// \return true if \p Tp represent a type, fully occupying whole register,
+ /// false otherwise.
+ bool isFullSingleRegisterType(Type *Tp) const;
+
/// \returns The cost of the address computation. For most targets this can be
/// merged into the instruction indexing mode. Some targets might want to
/// distinguish between address computation for memory operations on vector
@@ -2196,6 +2200,7 @@ class TargetTransformInfo::Concept {
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
+ virtual bool isFullSingleRegisterType(Type *Tp) const = 0;
virtual InstructionCost
getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
virtual InstructionCost
@@ -2930,6 +2935,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getNumberOfParts(Type *Tp) override {
return Impl.getNumberOfParts(Tp);
}
+ bool isFullSingleRegisterType(Type *Tp) const override {
+ return Impl.isFullSingleRegisterType(Tp);
+ }
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
const SCEV *Ptr) override {
return Impl.getAddressComputationCost(Ty, SE, Ptr);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 38aba183f6a173..ce6a96ea317ba7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -833,6 +833,7 @@ class TargetTransformInfoImplBase {
// Assume that we have a register of the right size for the type.
unsigned getNumberOfParts(Type *Tp) const { return 1; }
+ bool isFullSingleRegisterType(Type *Tp) const { return false; }
InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *,
const SCEV *) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 98cbb4886642bf..9e7ce48f901dc5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2612,6 +2612,22 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return *LT.first.getValue();
}
+ bool isFullSingleRegisterType(Type *Tp) const {
+ std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+ if (!LT.first.isValid() || LT.first > 1)
+ return false;
+
+ if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
+ Tp && LT.second.isFixedLengthVector()) {
+ // Check if the n x i1 fits fully into largest integer.
+ if (unsigned VF = LT.second.getVectorNumElements();
+ LT.second.getVectorElementType() == MVT::i1)
+ return DL.isLegalInteger(VF) && !DL.isLegalInteger(VF * 2);
+ return FTp == EVT(LT.second).getTypeForEVT(Tp->getContext());
+ }
+ return false;
+ }
+
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
const SCEV *) {
return 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1fb2b9836de0cc..f7ad9ed905e3a1 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1171,6 +1171,10 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return TTIImpl->getNumberOfParts(Tp);
}
+bool TargetTransformInfo::isFullSingleRegisterType(Type *Tp) const {
+ return TTIImpl->isFullSingleRegisterType(Tp);
+}
+
InstructionCost
TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
const SCEV *Ptr) const {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7723442bc0fb6e..5df21b77643746 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12080,7 +12080,11 @@ bool BoUpSLP::isTreeNotExtendable() const {
TreeEntry &E = *VectorizableTree[Idx];
if (!E.isGather())
continue;
- if (E.getOpcode() && E.getOpcode() != Instruction::Load)
+ if ((E.getOpcode() && E.getOpcode() != Instruction::Load) ||
+ (!E.getOpcode() &&
+ all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
+ (isa<ExtractElementInst>(E.Scalars.front()) &&
+ getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).getOpcode()))
return false;
if (isSplat(E.Scalars) || allConstant(E.Scalars))
continue;
@@ -19174,6 +19178,9 @@ class HorizontalReduction {
/// Checks if the optimization of original scalar identity operations on
/// matched horizontal reductions is enabled and allowed.
bool IsSupportedHorRdxIdentityOp = false;
+ /// Contains vector values for reduction including their scale factor and
+ /// signedness.
+ SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
@@ -19225,17 +19232,22 @@ class HorizontalReduction {
static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
+ Type *OpTy = LHS->getType();
+ assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
switch (Kind) {
case RecurKind::Or:
- if (UseSelect &&
- LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
- return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
+ if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+ return Builder.CreateSelect(
+ LHS,
+ ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
+ RHS, Name);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
case RecurKind::And:
- if (UseSelect &&
- LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
- return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
+ if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+ return Builder.CreateSelect(
+ LHS, RHS,
+ ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
case RecurKind::Add:
@@ -20108,12 +20120,11 @@ class HorizontalReduction {
SameValuesCounter, TrackedToOrig);
}
- Value *ReducedSubTree;
Type *ScalarTy = VL.front()->getType();
if (isa<FixedVectorType>(ScalarTy)) {
assert(SLPReVec && "FixedVectorType is not expected.");
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
- ReducedSubTree = PoisonValue::get(FixedVectorType::get(
+ Value *ReducedSubTree = PoisonValue::get(getWidenedType(
VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
// Do reduction for each lane.
@@ -20131,30 +20142,32 @@ class HorizontalReduction {
SmallVector<int, 16> Mask =
createStrideMask(I, ScalarTyNumElements, VL.size());
Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
- ReducedSubTree = Builder.CreateInsertElement(
- ReducedSubTree,
- emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
+ Value *Val =
+ createSingleOp(Builder, *TTI, Lane,
+ OptReusedScalars && SameScaleFactor
+ ? SameValuesCounter.front().second
+ : 1,
+ Lane->getType()->getScalarType() !=
+ VL.front()->getType()->getScalarType()
+ ? V.isSignedMinBitwidthRootNode()
+ : true, RdxRootInst->getType());
+ ReducedSubTree =
+ Builder.CreateInsertElement(ReducedSubTree, Val, I);
}
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
} else {
- ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
- RdxRootInst->getType());
+ Type *VecTy = VectorizedRoot->getType();
+ Type *RedScalarTy = VecTy->getScalarType();
+ VectorValuesAndScales.emplace_back(
+ VectorizedRoot,
+ OptReusedScalars && SameScaleFactor
+ ? SameValuesCounter.front().second
+ : 1,
+ RedScalarTy != ScalarTy->getScalarType()
+ ? V.isSignedMinBitwidthRootNode()
+ : true);
}
- if (ReducedSubTree->getType() != VL.front()->getType()) {
- assert(ReducedSubTree->getType() != VL.front()->getType() &&
- "Expected different reduction type.");
- ReducedSubTree =
- Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
- V.isSignedMinBitwidthRootNode());
- }
-
- // Improved analysis for add/fadd/xor reductions with same scale factor
- // for all operands of reductions. We can emit scalar ops for them
- // instead.
- if (OptReusedScalars && SameScaleFactor)
- ReducedSubTree = emitScaleForReusedOps(
- ReducedSubTree, Builder, SameValuesCounter.front().second);
- VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
// Count vectorized reduced values to exclude them from final reduction.
for (Value *RdxVal : VL) {
Value *OrigV = TrackedToOrig.at(RdxVal);
@@ -20183,6 +20196,10 @@ class HorizontalReduction {
continue;
}
}
+ if (!VectorValuesAndScales.empty())
+ VectorizedTree = GetNewVectorizedTree(
+ VectorizedTree,
+ emitReduction(Builder, *TTI, ReductionRoot->getType()));
if (VectorizedTree) {
// Reorder operands of bool logical op in the natural order to avoid
// possible problem with poison propagation. If not possible to reorder
@@ -20317,6 +20334,28 @@ class HorizontalReduction {
}
private:
+ /// Checks if the given type \p Ty is a vector type, which does not occupy the
+ /// whole vector register or is expensive for extraction.
+ static bool isNotFullVectorType(const TargetTransformInfo &TTI, Type *Ty) {
+ return TTI.getNumberOfParts(Ty) == 1 && !TTI.isFullSingleRegisterType(Ty);
+ }
+
+ /// Creates the reduction from the given \p Vec vector value with the given
+ /// scale \p Scale and signedness \p IsSigned.
+ Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+ Value *Vec, unsigned Scale, bool IsSigned,
+ Type *DestTy) {
+ Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
+ if (Rdx->getType() != DestTy->getScalarType())
+ Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
+ // Improved analysis for add/fadd/xor reductions with same scale
+ // factor for all operands of reductions. We can emit scalar ops for
+ // them instead.
+ if (Scale > 1)
+ Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
+ return Rdx;
+ }
+
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
@@ -20359,6 +20398,22 @@ class HorizontalReduction {
}
return Cost;
};
+ // Require reduction cost if:
+ // 1. This type is not a full register type and no other vectors with the
+ // same type in the storage (first vector with small type).
+ // 2. The storage does not have any vector with full vector use (first
+ // vector with full register use).
+ bool DoesRequireReductionOp =
+ !AllConsts &&
+ (VectorValuesAndScales.empty() ||
+ (isNotFullVectorType(*TTI, VectorTy) &&
+ none_of(VectorValuesAndScales,
+ [&](const auto &P) {
+ return std::get<0>(P)->getType() == VectorTy;
+ })) ||
+ all_of(VectorValuesAndScales, [&](const auto &P) {
+ return isNotFullVectorType(*TTI, std::get<0>(P)->getType());
+ }));
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
@@ -20382,7 +20437,7 @@ class HorizontalReduction {
VectorCost += TTI->getScalarizationOverhead(
VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
/*Extract*/ false, TTI::TCK_RecipThroughput);
- } else {
+ } else if (DoesRequireReductionOp) {
Type *RedTy = VectorTy->getElementType();
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
@@ -20394,6 +20449,14 @@ class HorizontalReduction {
RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
FMF, CostKind);
}
+ } else {
+ unsigned NumParts = TTI->getNumberOfParts(VectorTy);
+ unsigned RegVF = getPartNumElems(getNumElements(VectorTy), NumParts);
+ VectorCost +=
+ NumParts * TTI->getArithmeticInstrCost(
+ RdxOpcode,
+ getWidenedType(VectorTy->getScalarType(), RegVF),
+ CostKind);
}
}
ScalarCost = EvaluateScalarCost([&]() {
@@ -20410,8 +20473,19 @@ class HorizontalReduction {
case RecurKind::UMax:
case RecurKind::UMin: {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
- if (!AllConsts)
- VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+ if (!AllConsts) {
+ if (DoesRequireReductionOp) {
+ VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+ } else {
+ // Check if the previous reduction already exists and account it as
+ // series of operations + single reduction.
+ unsigned NumParts = TTI->getNumberOfParts(VectorTy);
+ unsigned RegVF = getPartNumElems(getNumElements(VectorTy), NumParts);
+ auto *RegVecTy = getWidenedType(VectorTy->getScalarType(), RegVF);
+ IntrinsicCostAttributes ICA(Id, RegVecTy, {RegVecTy, RegVecTy}, FMF);
+ VectorCost += NumParts * TTI->getIntrinsicInstrCost(ICA, CostKind);
+ }
+ }
ScalarCost = EvaluateScalarCost([&]() {
IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
return TTI->getIntrinsicInstrCost(ICA, CostKind);
@@ -20428,6 +20502,190 @@ class HorizontalReduction {
return VectorCost - ScalarCost;
}
+ /// Splits the values, stored in VectorValuesAndScales, into registers/free
+ /// sub-registers, combines them with the given reduction operation as a
+ /// vector operation and then performs single (small enough) reduction.
+ Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+ Type *DestTy) {
+ Value *ReducedSubTree = nullptr;
+ // Creates reduction and combines with the previous reduction.
+ auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
+ Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
+ if (ReducedSubTree)
+ ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
+ "op.rdx", ReductionOps);
+ else
+ ReducedSubTree = Rdx;
+ };
+ if (VectorValuesAndScales.size() == 1) {
+ const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
+ CreateSingleOp(Vec, Scale, IsSigned);
+ return ReducedSubTree;
+ }
+ // Splits multivector value into per-register values.
+ auto SplitVector = [&](Value *Vec) {
+ auto *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
+ unsigned Sz = getNumElements(Vec->getType());
+ unsigned NumParts = TTI.getNumberOfParts(Vec->getType());
+ if (NumParts <= 1 || NumParts >= Sz ||
+ isNotFullVectorType(TTI, Vec->getType()))
+ return SmallVector<Value *>(1, Vec);
+ unsigned RegSize = getPartNumElems(Sz, NumParts);
+ auto *DstTy = getWidenedType(ScalarTy, RegSize);
+ SmallVector<Value *> Regs(NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts))
+ Regs[Part] = Builder.CreateExtractVector(
+ DstTy, Vec, Builder.getInt64(Part * RegSize));
+ return Regs;
+ };
+ SmallMapVector<Type *, Value *, 4> VecOps;
+ // Scales Vec using given Cnt scale factor and then performs vector combine
+ // with previous value of VecOp.
+ auto CreateVecOp = [&](Value *Vec, unsigned Cnt) {
+ Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
+ // Scale Vec using given Cnt scale factor.
+ if (Cnt > 1) {
+ ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
+ switch (RdxKind) {
+ case RecurKind::Add: {
+ if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
+ unsigned VF = getNumElements(Vec->getType());
+ LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
+ << ". (HorRdx)\n");
+ SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
+ for (unsigned I : seq<unsigned>(Cnt))
+ std::iota(std::next(Mask.begin(), VF * I),
+ std::next(Mask.begin(), VF * (I + 1)), 0);
+ ++NumVectorInstructions;
+ Vec = Builder.CreateShuffleVector(Vec, Mask);
+ break;
+ }
+ // res = mul vv, n
+ Value *Scale =
+ ConstantVector::getSplat(EC, ConstantInt::get(ScalarTy, Cnt));
+ LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
+ << ". (HorRdx)\n");
+ ++NumVectorInstructions;
+ Vec = Builder.CreateMul(Vec, Scale);
+ break;
+ }
+ case RecurKind::Xor: {
+ // res = n % 2 ? 0 : vv
+ LLVM_DEBUG(dbgs()
+ << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
+ if (Cnt % 2 == 0)
+ Vec = Constant::getNullValue(Vec->getType());
+ break;
+ }
+ case RecurKind::FAdd: {
+ // res = fmul v, n
+ Value *Scale =
+ ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
+ LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
+ << ". (HorRdx)\n");
+ ++NumVectorInstructions;
+ Vec = Builder.CreateFMul(Vec, Scale);
+ break;
+ }
+ case RecurKind::And:
+ case RecurKind::Or:
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin:
+ case RecurKind::FMax:
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
+ // res = vv
+ break;
+ case RecurKind::Mul:
+ case RecurKind::FMul:
+ case RecurKind::FMulAdd:
+ case RecurKind::IAnyOf:
+ case RecurKind::FAnyOf:
+ case RecurKind::None:
+ llvm_unreachable("Unexpected reduction kind for repeated scalar.");
+ }
+ }
+ // Combine Vec w...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesAVX512, -O3+LTO Program size..text Benchmarks/Shootout-C++ - same transformed reduction RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same
Patch is 22.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118293.diff 5 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 985ca1532e0149..f2f0e56a3f2014 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1584,6 +1584,10 @@ class TargetTransformInfo {
/// split during legalization. Zero is returned when the answer is unknown.
unsigned getNumberOfParts(Type *Tp) const;
+ /// \return true if \p Tp represent a type, fully occupying whole register,
+ /// false otherwise.
+ bool isFullSingleRegisterType(Type *Tp) const;
+
/// \returns The cost of the address computation. For most targets this can be
/// merged into the instruction indexing mode. Some targets might want to
/// distinguish between address computation for memory operations on vector
@@ -2196,6 +2200,7 @@ class TargetTransformInfo::Concept {
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
+ virtual bool isFullSingleRegisterType(Type *Tp) const = 0;
virtual InstructionCost
getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
virtual InstructionCost
@@ -2930,6 +2935,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getNumberOfParts(Type *Tp) override {
return Impl.getNumberOfParts(Tp);
}
+ bool isFullSingleRegisterType(Type *Tp) const override {
+ return Impl.isFullSingleRegisterType(Tp);
+ }
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
const SCEV *Ptr) override {
return Impl.getAddressComputationCost(Ty, SE, Ptr);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 38aba183f6a173..ce6a96ea317ba7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -833,6 +833,7 @@ class TargetTransformInfoImplBase {
// Assume that we have a register of the right size for the type.
unsigned getNumberOfParts(Type *Tp) const { return 1; }
+ bool isFullSingleRegisterType(Type *Tp) const { return false; }
InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *,
const SCEV *) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 98cbb4886642bf..9e7ce48f901dc5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2612,6 +2612,22 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return *LT.first.getValue();
}
+ bool isFullSingleRegisterType(Type *Tp) const {
+ std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+ if (!LT.first.isValid() || LT.first > 1)
+ return false;
+
+ if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
+ Tp && LT.second.isFixedLengthVector()) {
+ // Check if the n x i1 fits fully into largest integer.
+ if (unsigned VF = LT.second.getVectorNumElements();
+ LT.second.getVectorElementType() == MVT::i1)
+ return DL.isLegalInteger(VF) && !DL.isLegalInteger(VF * 2);
+ return FTp == EVT(LT.second).getTypeForEVT(Tp->getContext());
+ }
+ return false;
+ }
+
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
const SCEV *) {
return 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1fb2b9836de0cc..f7ad9ed905e3a1 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1171,6 +1171,10 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return TTIImpl->getNumberOfParts(Tp);
}
+bool TargetTransformInfo::isFullSingleRegisterType(Type *Tp) const {
+ return TTIImpl->isFullSingleRegisterType(Tp);
+}
+
InstructionCost
TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
const SCEV *Ptr) const {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7723442bc0fb6e..5df21b77643746 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12080,7 +12080,11 @@ bool BoUpSLP::isTreeNotExtendable() const {
TreeEntry &E = *VectorizableTree[Idx];
if (!E.isGather())
continue;
- if (E.getOpcode() && E.getOpcode() != Instruction::Load)
+ if ((E.getOpcode() && E.getOpcode() != Instruction::Load) ||
+ (!E.getOpcode() &&
+ all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
+ (isa<ExtractElementInst>(E.Scalars.front()) &&
+ getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).getOpcode()))
return false;
if (isSplat(E.Scalars) || allConstant(E.Scalars))
continue;
@@ -19174,6 +19178,9 @@ class HorizontalReduction {
/// Checks if the optimization of original scalar identity operations on
/// matched horizontal reductions is enabled and allowed.
bool IsSupportedHorRdxIdentityOp = false;
+ /// Contains vector values for reduction including their scale factor and
+ /// signedness.
+ SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
@@ -19225,17 +19232,22 @@ class HorizontalReduction {
static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
+ Type *OpTy = LHS->getType();
+ assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
switch (Kind) {
case RecurKind::Or:
- if (UseSelect &&
- LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
- return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
+ if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+ return Builder.CreateSelect(
+ LHS,
+ ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
+ RHS, Name);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
case RecurKind::And:
- if (UseSelect &&
- LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
- return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
+ if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+ return Builder.CreateSelect(
+ LHS, RHS,
+ ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
case RecurKind::Add:
@@ -20108,12 +20120,11 @@ class HorizontalReduction {
SameValuesCounter, TrackedToOrig);
}
- Value *ReducedSubTree;
Type *ScalarTy = VL.front()->getType();
if (isa<FixedVectorType>(ScalarTy)) {
assert(SLPReVec && "FixedVectorType is not expected.");
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
- ReducedSubTree = PoisonValue::get(FixedVectorType::get(
+ Value *ReducedSubTree = PoisonValue::get(getWidenedType(
VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
// Do reduction for each lane.
@@ -20131,30 +20142,32 @@ class HorizontalReduction {
SmallVector<int, 16> Mask =
createStrideMask(I, ScalarTyNumElements, VL.size());
Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
- ReducedSubTree = Builder.CreateInsertElement(
- ReducedSubTree,
- emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
+ Value *Val =
+ createSingleOp(Builder, *TTI, Lane,
+ OptReusedScalars && SameScaleFactor
+ ? SameValuesCounter.front().second
+ : 1,
+ Lane->getType()->getScalarType() !=
+ VL.front()->getType()->getScalarType()
+ ? V.isSignedMinBitwidthRootNode()
+ : true, RdxRootInst->getType());
+ ReducedSubTree =
+ Builder.CreateInsertElement(ReducedSubTree, Val, I);
}
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
} else {
- ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
- RdxRootInst->getType());
+ Type *VecTy = VectorizedRoot->getType();
+ Type *RedScalarTy = VecTy->getScalarType();
+ VectorValuesAndScales.emplace_back(
+ VectorizedRoot,
+ OptReusedScalars && SameScaleFactor
+ ? SameValuesCounter.front().second
+ : 1,
+ RedScalarTy != ScalarTy->getScalarType()
+ ? V.isSignedMinBitwidthRootNode()
+ : true);
}
- if (ReducedSubTree->getType() != VL.front()->getType()) {
- assert(ReducedSubTree->getType() != VL.front()->getType() &&
- "Expected different reduction type.");
- ReducedSubTree =
- Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
- V.isSignedMinBitwidthRootNode());
- }
-
- // Improved analysis for add/fadd/xor reductions with same scale factor
- // for all operands of reductions. We can emit scalar ops for them
- // instead.
- if (OptReusedScalars && SameScaleFactor)
- ReducedSubTree = emitScaleForReusedOps(
- ReducedSubTree, Builder, SameValuesCounter.front().second);
- VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
// Count vectorized reduced values to exclude them from final reduction.
for (Value *RdxVal : VL) {
Value *OrigV = TrackedToOrig.at(RdxVal);
@@ -20183,6 +20196,10 @@ class HorizontalReduction {
continue;
}
}
+ if (!VectorValuesAndScales.empty())
+ VectorizedTree = GetNewVectorizedTree(
+ VectorizedTree,
+ emitReduction(Builder, *TTI, ReductionRoot->getType()));
if (VectorizedTree) {
// Reorder operands of bool logical op in the natural order to avoid
// possible problem with poison propagation. If not possible to reorder
@@ -20317,6 +20334,28 @@ class HorizontalReduction {
}
private:
+ /// Checks if the given type \p Ty is a vector type, which does not occupy the
+ /// whole vector register or is expensive for extraction.
+ static bool isNotFullVectorType(const TargetTransformInfo &TTI, Type *Ty) {
+ return TTI.getNumberOfParts(Ty) == 1 && !TTI.isFullSingleRegisterType(Ty);
+ }
+
+ /// Creates the reduction from the given \p Vec vector value with the given
+ /// scale \p Scale and signedness \p IsSigned.
+ Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+ Value *Vec, unsigned Scale, bool IsSigned,
+ Type *DestTy) {
+ Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
+ if (Rdx->getType() != DestTy->getScalarType())
+ Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
+ // Improved analysis for add/fadd/xor reductions with same scale
+ // factor for all operands of reductions. We can emit scalar ops for
+ // them instead.
+ if (Scale > 1)
+ Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
+ return Rdx;
+ }
+
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
@@ -20359,6 +20398,22 @@ class HorizontalReduction {
}
return Cost;
};
+ // Require reduction cost if:
+ // 1. This type is not a full register type and no other vectors with the
+ // same type in the storage (first vector with small type).
+ // 2. The storage does not have any vector with full vector use (first
+ // vector with full register use).
+ bool DoesRequireReductionOp =
+ !AllConsts &&
+ (VectorValuesAndScales.empty() ||
+ (isNotFullVectorType(*TTI, VectorTy) &&
+ none_of(VectorValuesAndScales,
+ [&](const auto &P) {
+ return std::get<0>(P)->getType() == VectorTy;
+ })) ||
+ all_of(VectorValuesAndScales, [&](const auto &P) {
+ return isNotFullVectorType(*TTI, std::get<0>(P)->getType());
+ }));
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
@@ -20382,7 +20437,7 @@ class HorizontalReduction {
VectorCost += TTI->getScalarizationOverhead(
VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
/*Extract*/ false, TTI::TCK_RecipThroughput);
- } else {
+ } else if (DoesRequireReductionOp) {
Type *RedTy = VectorTy->getElementType();
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
@@ -20394,6 +20449,14 @@ class HorizontalReduction {
RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
FMF, CostKind);
}
+ } else {
+ unsigned NumParts = TTI->getNumberOfParts(VectorTy);
+ unsigned RegVF = getPartNumElems(getNumElements(VectorTy), NumParts);
+ VectorCost +=
+ NumParts * TTI->getArithmeticInstrCost(
+ RdxOpcode,
+ getWidenedType(VectorTy->getScalarType(), RegVF),
+ CostKind);
}
}
ScalarCost = EvaluateScalarCost([&]() {
@@ -20410,8 +20473,19 @@ class HorizontalReduction {
case RecurKind::UMax:
case RecurKind::UMin: {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
- if (!AllConsts)
- VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+ if (!AllConsts) {
+ if (DoesRequireReductionOp) {
+ VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+ } else {
+ // Check if the previous reduction already exists and account it as
+ // series of operations + single reduction.
+ unsigned NumParts = TTI->getNumberOfParts(VectorTy);
+ unsigned RegVF = getPartNumElems(getNumElements(VectorTy), NumParts);
+ auto *RegVecTy = getWidenedType(VectorTy->getScalarType(), RegVF);
+ IntrinsicCostAttributes ICA(Id, RegVecTy, {RegVecTy, RegVecTy}, FMF);
+ VectorCost += NumParts * TTI->getIntrinsicInstrCost(ICA, CostKind);
+ }
+ }
ScalarCost = EvaluateScalarCost([&]() {
IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
return TTI->getIntrinsicInstrCost(ICA, CostKind);
@@ -20428,6 +20502,190 @@ class HorizontalReduction {
return VectorCost - ScalarCost;
}
+ /// Splits the values, stored in VectorValuesAndScales, into registers/free
+ /// sub-registers, combines them with the given reduction operation as a
+ /// vector operation and then performs single (small enough) reduction.
+ Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+ Type *DestTy) {
+ Value *ReducedSubTree = nullptr;
+ // Creates reduction and combines with the previous reduction.
+ auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
+ Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
+ if (ReducedSubTree)
+ ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
+ "op.rdx", ReductionOps);
+ else
+ ReducedSubTree = Rdx;
+ };
+ if (VectorValuesAndScales.size() == 1) {
+ const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
+ CreateSingleOp(Vec, Scale, IsSigned);
+ return ReducedSubTree;
+ }
+ // Splits multivector value into per-register values.
+ auto SplitVector = [&](Value *Vec) {
+ auto *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
+ unsigned Sz = getNumElements(Vec->getType());
+ unsigned NumParts = TTI.getNumberOfParts(Vec->getType());
+ if (NumParts <= 1 || NumParts >= Sz ||
+ isNotFullVectorType(TTI, Vec->getType()))
+ return SmallVector<Value *>(1, Vec);
+ unsigned RegSize = getPartNumElems(Sz, NumParts);
+ auto *DstTy = getWidenedType(ScalarTy, RegSize);
+ SmallVector<Value *> Regs(NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts))
+ Regs[Part] = Builder.CreateExtractVector(
+ DstTy, Vec, Builder.getInt64(Part * RegSize));
+ return Regs;
+ };
+ SmallMapVector<Type *, Value *, 4> VecOps;
+ // Scales Vec using given Cnt scale factor and then performs vector combine
+ // with previous value of VecOp.
+ auto CreateVecOp = [&](Value *Vec, unsigned Cnt) {
+ Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
+ // Scale Vec using given Cnt scale factor.
+ if (Cnt > 1) {
+ ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
+ switch (RdxKind) {
+ case RecurKind::Add: {
+ if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
+ unsigned VF = getNumElements(Vec->getType());
+ LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
+ << ". (HorRdx)\n");
+ SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
+ for (unsigned I : seq<unsigned>(Cnt))
+ std::iota(std::next(Mask.begin(), VF * I),
+ std::next(Mask.begin(), VF * (I + 1)), 0);
+ ++NumVectorInstructions;
+ Vec = Builder.CreateShuffleVector(Vec, Mask);
+ break;
+ }
+ // res = mul vv, n
+ Value *Scale =
+ ConstantVector::getSplat(EC, ConstantInt::get(ScalarTy, Cnt));
+ LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
+ << ". (HorRdx)\n");
+ ++NumVectorInstructions;
+ Vec = Builder.CreateMul(Vec, Scale);
+ break;
+ }
+ case RecurKind::Xor: {
+ // res = n % 2 ? 0 : vv
+ LLVM_DEBUG(dbgs()
+ << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
+ if (Cnt % 2 == 0)
+ Vec = Constant::getNullValue(Vec->getType());
+ break;
+ }
+ case RecurKind::FAdd: {
+ // res = fmul v, n
+ Value *Scale =
+ ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
+ LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
+ << ". (HorRdx)\n");
+ ++NumVectorInstructions;
+ Vec = Builder.CreateFMul(Vec, Scale);
+ break;
+ }
+ case RecurKind::And:
+ case RecurKind::Or:
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin:
+ case RecurKind::FMax:
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
+ // res = vv
+ break;
+ case RecurKind::Mul:
+ case RecurKind::FMul:
+ case RecurKind::FMulAdd:
+ case RecurKind::IAnyOf:
+ case RecurKind::FAnyOf:
+ case RecurKind::None:
+ llvm_unreachable("Unexpected reduction kind for repeated scalar.");
+ }
+ }
+ // Combine Vec w...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are no tests changed by this review.
Forgot to update, will fix it |
Created using spr 1.3.5
✅ With the latest revision this PR passed the undef deprecator. |
Ping! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please can you add a patch description and not just the perf changes?
@@ -1611,6 +1611,10 @@ class TargetTransformInfo { | |||
/// split during legalization. Zero is returned when the answer is unknown. | |||
unsigned getNumberOfParts(Type *Tp) const; | |||
|
|||
/// \return true if \p Tp represent a type, fully occupying whole register, | |||
/// false otherwise. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Improve the description as it doesn't seem to match the implementation in BasicTTIImpl.h
; CHECK-NEXT: [[RDX_OP19:%.*]] = add <2 x i64> [[RDX_OP18]], [[TMP11]] | ||
; CHECK-NEXT: [[RDX_OP20:%.*]] = add <2 x i64> [[RDX_OP19]], [[TMP12]] | ||
; CHECK-NEXT: [[RDX_OP21:%.*]] = add <2 x i64> [[RDX_OP20]], [[TMP13]] | ||
; CHECK-NEXT: [[OP_RDX16:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[RDX_OP21]]) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is all this better? It just feels like the reduction has been split into legal types.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of 2 reductions it now emits only 1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
but isn't all the additional extract / add precisely the same as the expansion of the original v8i64 reduction?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not exactly. Instead of the final
%red1 = reduce_add1 %v1
%red2 = reduce_add2 %v2
%res = add i64 %red1, %red2
it will emit
%v = add <2 x i64> %v1, %v2
%res = reduce_add %v
which is slightly better for X86 and significantly better for other targets
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm still of the opinion that this would be better:
; CHECK-NEXT: [[ADD:%.*]] = add <8 x i64> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[OP_RDX16:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[ADD]])
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I compared the codegen and the throughput, and they are almost the same. The modified version produces 3 fewer instructions and has a slightly lower rthroughput: 14.3 for the modified version vs. 15.0 for the original version on the generic CPU.
Created using spr 1.3.5
need to rebase |
I'm seeing a crash with this patch when compiling for my out-of-tree target:
I'll see if I can manage to reproduce for some in-tree target too. |
I'm going to revert the patch; please try to prepare a reproducer. |
This reverts commit 2ad8166 to fix bug/miscompiles, reported in #118293 (comment) and #118293 (comment).
This reverts commit 2ad8166 to fix bug/miscompiles, reported in llvm/llvm-project#118293 (comment) and llvm/llvm-project#118293 (comment).
Fixed |
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: #118293
This patch broke 2 bots:
clang crashes compiling
The full stacktrace is available in https://lab.llvm.org/buildbot/#/builders/143/builds/5429/steps/13/logs/stdio (search for The reproducer is attached. |
Should be fixed already with the new version of the patch |
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: llvm/llvm-project#118293
Thanks for the fix. |
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: #118293
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: llvm/llvm-project#118293
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: llvm#118293
This reverts commit 2ad8166 to fix bug/miscompiles, reported in llvm#118293 (comment) and llvm#118293 (comment).
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: llvm#118293
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: llvm#118293
The problem I saw disappeared with 3b18d47. |
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: llvm#118293
This reverts commit 2ad8166 to fix bug/miscompiles, reported in llvm#118293 (comment) and llvm#118293 (comment).
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: llvm#118293
SLP vectorizer is able to combine several reductions from the list of (potentially) reduced values with the different opcodes/values kind. Currently, these reductions are handled independently of each other. But instead the compiler can combine them into wide vector operations and then perform only single reduction. E.g, if the SLP vectorizer emits currently something like: ``` %r1 = reduce.add(<4 x i32> %v1) %r2 = reduce.add(<4 x i32> %v2) %r = add i32 %r1, %r2 ``` it can be emitted as: ``` %v = add <4 x i32> %v1, %v2 %r = reduce.add(<4 x i32> %v) ``` It allows to improve the performance in some cases. AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0% test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0% test-suite :: 
External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0% test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0% Benchmarks/Shootout-C++ - same transformed reduction Adobe-C++/loop_unroll - same transformed reductions, new vector code AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions FreeBench/fourinarow - same transformed reductions MiBench/telecomm-gsm - same transformed reductions execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions CFP2006/433.milc - better vector code, several x i64 reductions + trunc to i32 gets trunced to x i32 reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions, extra 4 x vectorization CINT2006/464.h264ref - same transformed reductions CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same transformed reductions CINT2017speed/600.perlbench_s CINT2017rate/500.perlbench_r - transformed same reduction JM/lencod - extra 4 x vectorization RISC-V, SiFive-p670, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0% test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0% test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 
24336.00 -0.2% test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2% test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3% test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4% test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4% test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4% execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same transformed reductions CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/automotive-susan - same transformed reductions ImageProcessing/Blur - same transformed reductions Benchmarks/7zip - same transformed reductions CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop) MiBench/telecomm-gsm - same transformed reductions Benchmarks/mediabench - same transformed reductions Vectorizer/VPlanNativePath - same transformed reductions Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions Regression/C/Regression-C-DuffsDevice - same transformed reductions Reviewers: hiraditya, topperc, preames Pull Request: llvm#118293
The SLP vectorizer is able to combine several reductions from the list of
(potentially) reduced values with different opcodes/value kinds.
Currently, these reductions are handled independently of each other. But
instead the compiler can combine them into wide vector operations and
then perform only a single reduction.
E.g, if the SLP vectorizer emits currently something like:
it can be emitted as:
This allows improving performance in some cases.
AVX512, -O3+LTO
Metric: size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 4553.00 4615.00 1.4%
test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 412708.00 416820.00 1.0%
test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test 12901.00 12981.00 0.6%
test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test 22717.00 22813.00 0.4%
test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39722.00 39850.00 0.3%
test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39725.00 39853.00 0.3%
test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 15918.00 15967.00 0.3%
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 155491.00 155587.00 0.1%
test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 227894.00 227942.00 0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1062188.00 1062364.00 0.0%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 793672.00 793720.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 657371.00 657403.00 0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 657371.00 657403.00 0.0%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2074917.00 2074933.00 0.0%
test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2074917.00 2074933.00 0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 855219.00 855203.00 -0.0%
Benchmarks/Shootout-C++ - same transformed reduction
Adobe-C++/loop_unroll - same transformed reductions, new vector code
AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions
FreeBench/fourinarow - same transformed reductions
MiBench/telecomm-gsm - same transformed reductions
execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions
CFP2006/433.milc - better vector code; several x i64 reductions + trunc
to i32 get truncated to x i32 reductions
ImageProcessing/Blur - same transformed reductions
Benchmarks/7zip - same transformed reductions, extra 4 x vectorization
CINT2006/464.h264ref - same transformed reductions
CINT2017rate/525.x264_r
CINT2017speed/625.x264_s - same transformed reductions
CINT2017speed/600.perlbench_s
CINT2017rate/500.perlbench_r - transformed same reduction
JM/lencod - extra 4 x vectorization
RISC-V, SiFive-p670, -O3+LTO
Metric: size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 8990.00 9514.00 5.8%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 588504.00 588488.00 -0.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 147464.00 147440.00 -0.0%
test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test 21496.00 21492.00 -0.0%
test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test 165420.00 165372.00 -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 843928.00 843648.00 -0.0%
test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test 100712.00 100672.00 -0.0%
test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 24384.00 24336.00 -0.2%
test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 24380.00 24332.00 -0.2%
test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test 10348.00 10316.00 -0.3%
test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 221304.00 220480.00 -0.4%
test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test 3750.00 3736.00 -0.4%
test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test 678.00 370.00 -45.4%
execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same
transformed reductions
CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions
MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop)
MiBench/automotive-susan - same transformed reductions
ImageProcessing/Blur - same transformed reductions
Benchmarks/7zip - same transformed reductions
CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast + ctpop)
MiBench/telecomm-gsm - same transformed reductions
Benchmarks/mediabench - same transformed reductions
Vectorizer/VPlanNativePath - same transformed reductions
Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions
Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions
Regression/C/Regression-C-DuffsDevice - same transformed reductions