Skip to content

Commit 5e3db3c

Browse files
committed
[AArch64] Add getVectorInstrCost Codesize costs handling.
We have a lot of missing CodeSize costs for vector operations. This patch starts things off by adding CodeSize costs for getVectorInstrCost, returning a single-instruction cost instead of the VectorInsertExtractBaseCost (which is typically 2). An insert of a loaded value is given a cost of 0, as it is expected to use ld1; i1 inserts and extracts are given a cost of 2 to account for the extra cset or cmp; otherwise the cost is 1.
1 parent adb44ed commit 5e3db3c

File tree

8 files changed

+123
-118
lines changed

8 files changed

+123
-118
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3302,8 +3302,8 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
33023302
}
33033303

33043304
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3305-
unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
3306-
const Instruction *I, Value *Scalar,
3305+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3306+
bool HasRealUse, const Instruction *I, Value *Scalar,
33073307
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
33083308
assert(Val->isVectorTy() && "This must be a vector type");
33093309

@@ -3336,12 +3336,16 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
33363336
// and its second operand is a load, then we will generate a LD1, which
33373337
// are expensive instructions.
33383338
if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3339-
return ST->getVectorInsertExtractBaseCost() + 1;
3339+
return CostKind == TTI::TCK_CodeSize
3340+
? 0
3341+
: ST->getVectorInsertExtractBaseCost() + 1;
33403342

33413343
// i1 inserts and extracts will include an extra cset or cmp of the vector
33423344
// value. Increase the cost by 1 to account for it.
33433345
if (Val->getScalarSizeInBits() == 1)
3344-
return ST->getVectorInsertExtractBaseCost() + 1;
3346+
return CostKind == TTI::TCK_CodeSize
3347+
? 2
3348+
: ST->getVectorInsertExtractBaseCost() + 1;
33453349

33463350
// FIXME:
33473351
// If the extract-element and insert-element instructions could be
@@ -3465,7 +3469,8 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
34653469
return 0;
34663470

34673471
// All other insert/extracts cost this much.
3468-
return ST->getVectorInsertExtractBaseCost();
3472+
return CostKind == TTI::TCK_CodeSize ? 1
3473+
: ST->getVectorInsertExtractBaseCost();
34693474
}
34703475

34713476
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
@@ -3474,22 +3479,22 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
34743479
Value *Op1) {
34753480
bool HasRealUse =
34763481
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3477-
return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
3482+
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
34783483
}
34793484

34803485
InstructionCost AArch64TTIImpl::getVectorInstrCost(
34813486
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
34823487
Value *Scalar,
34833488
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3484-
return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
3485-
ScalarUserAndIdx);
3489+
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
3490+
Scalar, ScalarUserAndIdx);
34863491
}
34873492

34883493
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
34893494
Type *Val,
34903495
TTI::TargetCostKind CostKind,
34913496
unsigned Index) {
3492-
return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
3497+
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
34933498
true /* HasRealUse */, &I);
34943499
}
34953500

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
7373
/// of the extract(nullptr if user is not known before vectorization) and
7474
/// 'Idx' being the extract lane.
7575
InstructionCost getVectorInstrCostHelper(
76-
unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
77-
const Instruction *I = nullptr, Value *Scalar = nullptr,
76+
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
77+
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
7878
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {});
7979

8080
public:

llvm/test/Analysis/CostModel/AArch64/arith-fp.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -536,9 +536,9 @@ define void @fsqrt() {
536536
define void @fsqrt_fp16() {
537537
; CHECK-BASE-LABEL: 'fsqrt_fp16'
538538
; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %F16 = call half @llvm.sqrt.f16(half undef)
539-
; CHECK-BASE-NEXT: Cost Model: Found costs of 10 for: %V4F16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
540-
; CHECK-BASE-NEXT: Cost Model: Found costs of 22 for: %V8F16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
541-
; CHECK-BASE-NEXT: Cost Model: Found costs of 44 for: %V16F16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
539+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V4F16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
540+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %V8F16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
541+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:44 CodeSize:30 Lat:44 SizeLat:44 for: %V16F16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
542542
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
543543
;
544544
; CHECK-FP16-LABEL: 'fsqrt_fp16'
@@ -679,9 +679,9 @@ define void @fma() {
679679
define void @fma_fp16() {
680680
; CHECK-BASE-LABEL: 'fma_fp16'
681681
; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
682-
; CHECK-BASE-NEXT: Cost Model: Found costs of 10 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
683-
; CHECK-BASE-NEXT: Cost Model: Found costs of 22 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
684-
; CHECK-BASE-NEXT: Cost Model: Found costs of 44 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
682+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
683+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
684+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:44 CodeSize:30 Lat:44 SizeLat:44 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
685685
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
686686
;
687687
; CHECK-FP16-LABEL: 'fma_fp16'

llvm/test/Analysis/CostModel/AArch64/insert-extract.ll

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -11,38 +11,38 @@ target triple = "aarch64--linux-gnu"
1111

1212
define void @vectorInstrCost() {
1313
; CHECK-LABEL: 'vectorInstrCost'
14-
; CHECK-NEXT: Cost Model: Found costs of 3 for: %ta0 = extractelement <8 x i1> undef, i32 0
15-
; CHECK-NEXT: Cost Model: Found costs of 3 for: %ta1 = extractelement <8 x i1> undef, i32 1
16-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t1 = extractelement <8 x i8> undef, i32 0
17-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t2 = extractelement <8 x i8> undef, i32 1
18-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t3 = extractelement <4 x i16> undef, i32 0
19-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t4 = extractelement <4 x i16> undef, i32 1
20-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t5 = extractelement <2 x i32> undef, i32 0
21-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t6 = extractelement <2 x i32> undef, i32 1
22-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t7 = extractelement <2 x i64> undef, i32 0
23-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t8 = extractelement <2 x i64> undef, i32 1
14+
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %ta0 = extractelement <8 x i1> undef, i32 0
15+
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %ta1 = extractelement <8 x i1> undef, i32 1
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t1 = extractelement <8 x i8> undef, i32 0
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t2 = extractelement <8 x i8> undef, i32 1
18+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t3 = extractelement <4 x i16> undef, i32 0
19+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t4 = extractelement <4 x i16> undef, i32 1
20+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t5 = extractelement <2 x i32> undef, i32 0
21+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t6 = extractelement <2 x i32> undef, i32 1
22+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t7 = extractelement <2 x i64> undef, i32 0
23+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t8 = extractelement <2 x i64> undef, i32 1
2424
; CHECK-NEXT: Cost Model: Found costs of 0 for: %t9 = extractelement <4 x half> undef, i32 0
25-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t10 = extractelement <4 x half> undef, i32 1
25+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t10 = extractelement <4 x half> undef, i32 1
2626
; CHECK-NEXT: Cost Model: Found costs of 0 for: %t11 = extractelement <2 x float> undef, i32 0
27-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t12 = extractelement <2 x float> undef, i32 1
27+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t12 = extractelement <2 x float> undef, i32 1
2828
; CHECK-NEXT: Cost Model: Found costs of 0 for: %t13 = extractelement <2 x double> undef, i32 0
29-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t14 = extractelement <2 x double> undef, i32 1
30-
; CHECK-NEXT: Cost Model: Found costs of 3 for: %t31 = insertelement <8 x i1> undef, i1 false, i32 0
31-
; CHECK-NEXT: Cost Model: Found costs of 3 for: %t41 = insertelement <8 x i1> undef, i1 true, i32 1
32-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t30 = insertelement <8 x i8> undef, i8 0, i32 0
33-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t40 = insertelement <8 x i8> undef, i8 1, i32 1
34-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t50 = insertelement <4 x i16> undef, i16 2, i32 0
35-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t60 = insertelement <4 x i16> undef, i16 3, i32 1
36-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t70 = insertelement <2 x i32> undef, i32 4, i32 0
37-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t80 = insertelement <2 x i32> undef, i32 5, i32 1
38-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t90 = insertelement <2 x i64> undef, i64 6, i32 0
39-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t100 = insertelement <2 x i64> undef, i64 7, i32 1
29+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t14 = extractelement <2 x double> undef, i32 1
30+
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %t31 = insertelement <8 x i1> undef, i1 false, i32 0
31+
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %t41 = insertelement <8 x i1> undef, i1 true, i32 1
32+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t30 = insertelement <8 x i8> undef, i8 0, i32 0
33+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t40 = insertelement <8 x i8> undef, i8 1, i32 1
34+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t50 = insertelement <4 x i16> undef, i16 2, i32 0
35+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t60 = insertelement <4 x i16> undef, i16 3, i32 1
36+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t70 = insertelement <2 x i32> undef, i32 4, i32 0
37+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t80 = insertelement <2 x i32> undef, i32 5, i32 1
38+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t90 = insertelement <2 x i64> undef, i64 6, i32 0
39+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t100 = insertelement <2 x i64> undef, i64 7, i32 1
4040
; CHECK-NEXT: Cost Model: Found costs of 0 for: %t110 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 0
41-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1
41+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1
4242
; CHECK-NEXT: Cost Model: Found costs of 0 for: %t130 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0
43-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1
43+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1
4444
; CHECK-NEXT: Cost Model: Found costs of 0 for: %t150 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 0
45-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1
45+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1
4646
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
4747
;
4848
%ta0 = extractelement <8 x i1> undef, i32 0
@@ -86,7 +86,7 @@ define void @vectorInstrCost() {
8686
define <8 x i8> @LD1_B(<8 x i8> %vec, ptr noundef %i) {
8787
; CHECK-LABEL: 'LD1_B'
8888
; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i8, ptr %i, align 1
89-
; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
89+
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
9090
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %v2
9191
;
9292
entry:
@@ -98,7 +98,7 @@ entry:
9898
define <4 x i16> @LD1_H(<4 x i16> %vec, ptr noundef %i) {
9999
; CHECK-LABEL: 'LD1_H'
100100
; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i16, ptr %i, align 2
101-
; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
101+
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
102102
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %v2
103103
;
104104
entry:
@@ -110,7 +110,7 @@ entry:
110110
define <4 x i32> @LD1_W(<4 x i32> %vec, ptr noundef %i) {
111111
; CHECK-LABEL: 'LD1_W'
112112
; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i32, ptr %i, align 4
113-
; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
113+
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
114114
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %v2
115115
;
116116
entry:
@@ -122,7 +122,7 @@ entry:
122122
define <2 x i64> @LD1_X(<2 x i64> %vec, ptr noundef %i) {
123123
; CHECK-LABEL: 'LD1_X'
124124
; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i64, ptr %i, align 8
125-
; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
125+
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
126126
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %v2
127127
;
128128
entry:

0 commit comments

Comments
 (0)