Skip to content

Commit 866b9f4

Browse files
[SystemZ] Add realistic cost estimates for vector reduction intrinsics (#118319)
This PR adds more realistic cost estimates for these reduction intrinsics: - `llvm.vector.reduce.umax` - `llvm.vector.reduce.umin` - `llvm.vector.reduce.smax` - `llvm.vector.reduce.smin` - `llvm.vector.reduce.fadd` - `llvm.vector.reduce.fmul` - `llvm.vector.reduce.fmax` - `llvm.vector.reduce.fmin` - `llvm.vector.reduce.fmaximum` - `llvm.vector.reduce.fminimum` - `llvm.vector.reduce.mul` The pre-existing cost estimates for `llvm.vector.reduce.add` are moved to `getArithmeticReductionCost` to reduce complexity in `getVectorIntrinsicInstrCost` and enable other passes, like the SLP vectorizer, to benefit from these updated calculations. These are not expected to provide noticeable performance improvements and are rather provided for the sake of completeness and correctness. This PR is in draft mode pending benchmark confirmation of this. This also provides and/or updates cost tests for all of these intrinsics. This PR was co-authored by me and @JonPsson1.
1 parent c7babfa commit 866b9f4

File tree

7 files changed

+1245
-146
lines changed

7 files changed

+1245
-146
lines changed

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Lines changed: 75 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "llvm/IR/IntrinsicInst.h"
2222
#include "llvm/IR/Intrinsics.h"
2323
#include "llvm/Support/Debug.h"
24+
#include "llvm/Support/InstructionCost.h"
2425
#include "llvm/Support/MathExtras.h"
2526

2627
using namespace llvm;
@@ -1396,30 +1397,86 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
13961397
return NumVectorMemOps + NumPermutes;
13971398
}
13981399

1400+
/// Estimate the cost of an integer add reduction.
///
/// \param NumVec     Number of vector registers holding the input elements.
/// \param ScalarBits Width in bits of a single element.
/// \returns The number of vector adds needed to combine all registers into
///          one, plus the fixed cost of summing up that final register.
///
/// This helper is only used inside this file, so it gets internal linkage.
static InstructionCost getIntAddReductionCost(unsigned NumVec,
                                              unsigned ScalarBits) {
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For integer adds, VSUM creates shorter reductions on the final vector.
  // Elements narrower than 32 bits are charged one extra instruction.
  Cost += (ScalarBits < 32) ? 3 : 2;
  return Cost;
}
1408+
1409+
/// Estimate the cost of an unordered reduction that is lowered to a tree of
/// vector operations followed by shuffle / arithmetic layers on the last
/// vector.
///
/// \param NumVec     Number of vector registers holding the input elements.
/// \param NumElems   Total number of elements being reduced.
/// \param ScalarBits Width in bits of a single element; callers must pass a
///                   non-zero value no larger than SystemZ::VectorBits.
/// \returns The combined cost of the register tree and the in-register
///          layers.
///
/// This helper is only used inside this file, so it gets internal linkage.
static InstructionCost getFastReductionCost(unsigned NumVec,
                                            unsigned NumElems,
                                            unsigned ScalarBits) {
  unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For each shuffle / arithmetic layer, we need 2 instructions, and we need
  // log2(Elements in Last Vector) layers.
  Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
  return Cost;
}
1420+
1421+
inline bool customCostReductions(unsigned Opcode) {
1422+
return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
1423+
Opcode == Instruction::Add || Opcode == Instruction::Mul;
1424+
}
1425+
1426+
/// Estimate the cost of an arithmetic vector reduction.
///
/// \param Opcode   Instruction opcode of the reduction (e.g. Add, FMul).
/// \param Ty       Vector type being reduced.
/// \param FMF      Optional fast-math flags; they decide whether the
///                 reduction has to be performed in order.
/// \param CostKind Kind of cost requested, forwarded to the generic
///                 implementation on fallback.
/// \returns A target-specific estimate for unordered int/fp add and mul
///          reductions of fixed-width vectors on subtargets with the vector
///          facility; the generic estimate in all other cases.
InstructionCost
SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) {
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  // Use a checked cast: the element count is only meaningful for fixed-width
  // vectors, anything else is left to the generic implementation.
  auto *FixedTy = dyn_cast<FixedVectorType>(Ty);
  // The following is only for subtargets with vector math, non-ordered
  // reductions, and reasonable scalar sizes for int and fp add/mul.
  if (FixedTy && customCostReductions(Opcode) && ST->hasVector() &&
      !TTI::requiresOrderedReduction(FMF) &&
      ScalarBits <= SystemZ::VectorBits) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    // Integer Add is using custom code gen, that needs to be accounted for.
    if (Opcode == Instruction::Add)
      return getIntAddReductionCost(NumVectors, ScalarBits);
    // The base cost is the same across all other arithmetic instructions
    InstructionCost Cost = getFastReductionCost(
        NumVectors, FixedTy->getNumElements(), ScalarBits);
    // But we need to account for the final op involving the scalar operand.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FMul)
      Cost += 1;
    return Cost;
  }
  // otherwise, fall back to the standard implementation
  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
}
1452+
1453+
/// Estimate the cost of a min/max vector reduction.
///
/// \param IID      Intrinsic ID of the min/max operation, forwarded to the
///                 generic implementation on fallback.
/// \param Ty       Vector type being reduced.
/// \param FMF      Fast-math flags, forwarded on fallback.
/// \param CostKind Kind of cost requested, forwarded on fallback.
/// \returns A target-specific estimate on subtargets with vector
///          enhancements 1 for fixed-width vectors whose elements fit into a
///          vector register; the generic estimate in all other cases.
InstructionCost
SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                       FastMathFlags FMF,
                                       TTI::TargetCostKind CostKind) {
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  // Use a checked cast: the element count is only meaningful for fixed-width
  // vectors.
  auto *FixedTy = dyn_cast<FixedVectorType>(Ty);
  // Return custom costs only on subtargets with vector enhancements.
  // Elements wider than a vector register would make VectorBits / ScalarBits
  // zero, so that the "- 1" below wraps around in unsigned arithmetic and
  // yields a huge bogus cost; a zero element size would divide by zero. Both
  // cases are left to the generic implementation, matching the scalar size
  // guard used for arithmetic reductions.
  if (FixedTy && ST->hasVectorEnhancements1() && ScalarBits != 0 &&
      ScalarBits <= SystemZ::VectorBits) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    unsigned NumElems = FixedTy->getNumElements();
    unsigned NumEltsPerVecReg = SystemZ::VectorBits / ScalarBits;
    InstructionCost Cost = 0;
    // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
    Cost += NumVectors - 1;
    // For the final vector, we need shuffle + min/max operations, and
    // we need #Elements - 1 of them.
    Cost += 2 * (std::min(NumElems, NumEltsPerVecReg) - 1);
    return Cost;
  }
  // For other targets, fall back to the standard implementation
  return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
}
1473+
13991474
// Target-specific cost for a small set of vector intrinsics.
// A bswap with a vector return type costs one instruction (VPERM) per vector
// register of RetTy. Any other intrinsic yields -1, which stands for "no
// custom estimate" (the removed code below describes -1 as falling back to
// the generic cost estimate).
// In this diff view, the lines following a marker that ends in '-' are the
// vector_reduce_add handling that this commit removes from the function.
static int
14001475
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
14011476
const SmallVectorImpl<Type *> &ParamTys) {
14021477
if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
14031478
return getNumVectorRegs(RetTy); // VPERM
14041479

1405-
if (ID == Intrinsic::vector_reduce_add) {
1406-
// Retrieve number and size of elements for the vector op.
1407-
auto *VTy = cast<FixedVectorType>(ParamTys.front());
1408-
unsigned ScalarSize = VTy->getScalarSizeInBits();
1409-
// For scalar sizes >128 bits, we fall back to the generic cost estimate.
1410-
if (ScalarSize > SystemZ::VectorBits)
1411-
return -1;
1412-
// This many vector regs are needed to represent the input elements (V).
1413-
unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
1414-
// This many instructions are needed for the final sum of vector elems (S).
1415-
unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
1416-
// We use vector adds to create a sum vector, which takes
1417-
// V/2 + V/4 + ... = V - 1 operations.
1418-
// Then, we need S operations to sum up the elements of that sum vector,
1419-
// for a total of V + S - 1 operations.
1420-
int Cost = VectorRegsNeeded + LastVectorHandling - 1;
1421-
return Cost;
1422-
}
14231480
return -1;
14241481
}
14251482

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,13 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
130130
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
131131
bool UseMaskForCond = false, bool UseMaskForGaps = false);
132132

133+
/// Cost of an arithmetic vector reduction with instruction opcode \p Opcode
/// over \p Ty. \p FMF optionally carries the fast-math flags of the
/// reduction and \p CostKind selects the kind of cost being queried.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
134+
std::optional<FastMathFlags> FMF,
135+
TTI::TargetCostKind CostKind);
136+
/// Cost of a min/max vector reduction identified by intrinsic \p IID over
/// \p Ty. \p FMF carries the fast-math flags of the reduction and
/// \p CostKind selects the kind of cost being queried.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
137+
FastMathFlags FMF,
138+
TTI::TargetCostKind CostKind);
139+
133140
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
134141
TTI::TargetCostKind CostKind);
135142

llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll

Lines changed: 0 additions & 128 deletions
This file was deleted.

0 commit comments

Comments
 (0)