21 | 21 | #include "llvm/IR/IntrinsicInst.h"
22 | 22 | #include "llvm/IR/Intrinsics.h"
23 | 23 | #include "llvm/Support/Debug.h"
| 24 | +#include "llvm/Support/InstructionCost.h"
24 | 25 | #include "llvm/Support/MathExtras.h"
25 | 26 |
26 | 27 | using namespace llvm;
@@ -1396,30 +1397,86 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
1396 | 1397 |   return NumVectorMemOps + NumPermutes;
1397 | 1398 | }
1398 | 1399 |
| 1400 | +static InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) {
| 1401 | +  InstructionCost Cost = 0;
| 1402 | +  // A binary tree of N/2 + N/4 + ... vector adds yields N - 1 operations in total.
| 1403 | +  Cost += NumVec - 1;
| 1404 | +  // For integer adds, VSUM creates shorter reductions on the final vector.
| 1405 | +  Cost += (ScalarBits < 32) ? 3 : 2;
| 1406 | +  return Cost;
| 1407 | +}
| 1408 | +
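To sanity-check the formula, here is a small standalone sketch (editorial, not part of the patch), assuming SystemZ's 128-bit vector registers:

    #include <cassert>

    // Mirrors getIntAddReductionCost: NumVec - 1 tree adds, plus a VSUM-based
    // finish that takes one extra instruction for sub-32-bit elements.
    unsigned intAddReductionCost(unsigned NumVec, unsigned ScalarBits) {
      return (NumVec - 1) + (ScalarBits < 32 ? 3u : 2u);
    }

    int main() {
      assert(intAddReductionCost(4, 32) == 5); // e.g. <16 x i32>: 4 vector regs
      assert(intAddReductionCost(4, 16) == 6); // e.g. <32 x i16>: 4 vector regs
    }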
| 1409 | +static InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
| 1410 | +                                             unsigned ScalarBits) {
| 1411 | +  unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
| 1412 | +  InstructionCost Cost = 0;
| 1413 | +  // A binary tree of N/2 + N/4 + ... vector ops yields N - 1 operations in total.
| 1414 | +  Cost += NumVec - 1;
| 1415 | +  // Each shuffle / arithmetic layer on the final vector takes 2 instructions,
| 1416 | +  // and we need log2(#elements in the last vector) layers.
| 1417 | +  Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
| 1418 | +  return Cost;
| 1419 | +}
| 1420 | +
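Again as an editorial sketch (assuming 128-bit vector registers; not taken from the patch), the fast-reduction formula evaluates like this:

    #include <algorithm>
    #include <cassert>

    // Mirrors getFastReductionCost: NumVec - 1 tree ops, then 2 instructions
    // (shuffle + arithmetic) per halving layer of the last vector register.
    unsigned fastReductionCost(unsigned NumVec, unsigned NumElems,
                               unsigned ScalarBits) {
      unsigned EltsPerReg = 128 / ScalarBits;
      unsigned LastElts = std::min(NumElems, EltsPerReg);
      unsigned Layers = 0;
      while ((1u << Layers) < LastElts) // ceil(log2(LastElts))
        ++Layers;
      return (NumVec - 1) + 2 * Layers;
    }

    int main() {
      assert(fastReductionCost(4, 8, 64) == 5);  // e.g. <8 x double>: 3 + 2*1
      assert(fastReductionCost(4, 16, 32) == 7); // e.g. <16 x float>: 3 + 2*2
    }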
| 1421 | +static bool customCostReductions(unsigned Opcode) {
| 1422 | +  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
| 1423 | +         Opcode == Instruction::Add || Opcode == Instruction::Mul;
| 1424 | +}
| 1425 | +
| 1426 | +InstructionCost
| 1427 | +SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
| 1428 | +                                           std::optional<FastMathFlags> FMF,
| 1429 | +                                           TTI::TargetCostKind CostKind) {
| 1430 | +  unsigned ScalarBits = Ty->getScalarSizeInBits();
| 1431 | +  // The following is only for subtargets with vector math, non-ordered
| 1432 | +  // reductions, and reasonable scalar sizes for int and fp add/mul.
| 1433 | +  if (customCostReductions(Opcode) && ST->hasVector() &&
| 1434 | +      !TTI::requiresOrderedReduction(FMF) &&
| 1435 | +      ScalarBits <= SystemZ::VectorBits) {
| 1436 | +    unsigned NumVectors = getNumVectorRegs(Ty);
| 1437 | +    unsigned NumElems = cast<FixedVectorType>(Ty)->getNumElements();
| 1438 | +    // Integer add uses custom code generation, which must be costed separately.
| 1439 | +    if (Opcode == Instruction::Add)
| 1440 | +      return getIntAddReductionCost(NumVectors, ScalarBits);
| 1441 | +    // The base cost is the same across all other arithmetic instructions...
| 1442 | +    InstructionCost Cost =
| 1443 | +        getFastReductionCost(NumVectors, NumElems, ScalarBits);
| 1444 | +    // ...but we need to account for the final op involving the scalar operand.
| 1445 | +    if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
| 1446 | +      Cost += 1;
| 1447 | +    return Cost;
| 1448 | +  }
| 1449 | +  // Otherwise, fall back to the standard implementation.
| 1450 | +  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
| 1451 | +}
| 1452 | +
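For instance, under this model an unordered fadd reduction of <16 x float> would be costed at 7 + 1 = 8: three tree adds across the four vector registers, two shuffle/add layers on the last register, and one final operation folding in the scalar start value (an editorial reading of the code above, not a figure quoted from the patch).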
| 1453 | +InstructionCost
| 1454 | +SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
| 1455 | +                                       FastMathFlags FMF,
| 1456 | +                                       TTI::TargetCostKind CostKind) {
| 1457 | +  // Return custom costs only on subtargets with vector enhancements.
| 1458 | +  if (ST->hasVectorEnhancements1()) {
| 1459 | +    unsigned NumVectors = getNumVectorRegs(Ty);
| 1460 | +    unsigned NumElems = cast<FixedVectorType>(Ty)->getNumElements();
| 1461 | +    unsigned ScalarBits = Ty->getScalarSizeInBits();
| 1462 | +    InstructionCost Cost = 0;
| 1463 | +    // A binary tree of N/2 + N/4 + ... vector ops yields N - 1 operations in total.
| 1464 | +    Cost += NumVectors - 1;
| 1465 | +    // For the final vector, we need one shuffle + min/max pair per remaining
| 1466 | +    // element, i.e. #elements - 1 of them.
| 1467 | +    Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
| 1468 | +    return Cost;
| 1469 | +  }
| 1470 | +  // For other subtargets, fall back to the standard implementation.
| 1471 | +  return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
| 1472 | +}
| 1473 | +
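One more editorial sketch showing how the min/max formula evaluates (again assuming 128-bit vector registers):

    #include <algorithm>
    #include <cassert>

    // Mirrors getMinMaxReductionCost: NumVec - 1 tree min/max ops, then one
    // shuffle + min/max pair per remaining element of the last register.
    unsigned minMaxReductionCost(unsigned NumVec, unsigned NumElems,
                                 unsigned ScalarBits) {
      unsigned EltsPerReg = 128 / ScalarBits;
      return (NumVec - 1) + 2 * (std::min(NumElems, EltsPerReg) - 1);
    }

    int main() {
      assert(minMaxReductionCost(2, 8, 32) == 7); // e.g. <8 x i32>: 1 + 2*3
      assert(minMaxReductionCost(1, 2, 64) == 2); // e.g. <2 x i64>: 0 + 2*1
    }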
1399 | 1474 | static int
1400 | 1475 | getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1401 | 1476 |                             const SmallVectorImpl<Type *> &ParamTys) {
1402 | 1477 |   if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1403 | 1478 |     return getNumVectorRegs(RetTy); // VPERM
1404 | 1479 |
1405 | | -  if (ID == Intrinsic::vector_reduce_add) {
1406 | | -    // Retrieve number and size of elements for the vector op.
1407 | | -    auto *VTy = cast<FixedVectorType>(ParamTys.front());
1408 | | -    unsigned ScalarSize = VTy->getScalarSizeInBits();
1409 | | -    // For scalar sizes >128 bits, we fall back to the generic cost estimate.
1410 | | -    if (ScalarSize > SystemZ::VectorBits)
1411 | | -      return -1;
1412 | | -    // This many vector regs are needed to represent the input elements (V).
1413 | | -    unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
1414 | | -    // This many instructions are needed for the final sum of vector elems (S).
1415 | | -    unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
1416 | | -    // We use vector adds to create a sum vector, which takes
1417 | | -    // V/2 + V/4 + ... = V - 1 operations.
1418 | | -    // Then, we need S operations to sum up the elements of that sum vector,
1419 | | -    // for a total of V + S - 1 operations.
1420 | | -    int Cost = VectorRegsNeeded + LastVectorHandling - 1;
1421 | | -    return Cost;
1422 | | -  }
1423 | 1480 |   return -1;
1424 | 1481 | }
1425 | 1482 |