Skip to content

Commit 2f7f0e7

Browse files
committed
[CostModel][X86] Updated reverse shuffle costs
llvm-svn: 289819
1 parent 4160264 commit 2f7f0e7

File tree

2 files changed

+151
-37
lines changed

2 files changed

+151
-37
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 95 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -604,12 +604,102 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
604604

605605
if (Kind == TTI::SK_Reverse) {
606606
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
607-
int Cost = 1;
608-
if (LT.second.getSizeInBits() > 128)
609-
Cost = 3; // Extract + insert + copy.
610607

611-
// Multiply by the number of parts.
612-
return Cost * LT.first;
608+
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
609+
{ ISD::VECTOR_SHUFFLE, MVT::v64i8, 1 }, // vpermb
610+
{ ISD::VECTOR_SHUFFLE, MVT::v32i8, 1 } // vpermb
611+
};
612+
613+
if (ST->hasVBMI())
614+
if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
615+
ISD::VECTOR_SHUFFLE, LT.second))
616+
return LT.first * Entry->Cost;
617+
618+
static const CostTblEntry AVX512BWShuffleTbl[] = {
619+
{ ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpermw
620+
{ ISD::VECTOR_SHUFFLE, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
621+
// + 2*pshufb + vinserti64x4
622+
};
623+
624+
if (ST->hasBWI())
625+
if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
626+
ISD::VECTOR_SHUFFLE, LT.second))
627+
return LT.first * Entry->Cost;
628+
629+
static const CostTblEntry AVX512ShuffleTbl[] = {
630+
{ ISD::VECTOR_SHUFFLE, MVT::v8f64, 1 }, // vpermpd
631+
{ ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vpermps
632+
{ ISD::VECTOR_SHUFFLE, MVT::v8i64, 1 }, // vpermq
633+
{ ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }, // vpermd
634+
};
635+
636+
if (ST->hasAVX512())
637+
if (const auto *Entry =
638+
CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
639+
return LT.first * Entry->Cost;
640+
641+
static const CostTblEntry AVX2ShuffleTbl[] = {
642+
{ ISD::VECTOR_SHUFFLE, MVT::v4f64, 1 }, // vpermpd
643+
{ ISD::VECTOR_SHUFFLE, MVT::v8f32, 1 }, // vpermps
644+
{ ISD::VECTOR_SHUFFLE, MVT::v4i64, 1 }, // vpermq
645+
{ ISD::VECTOR_SHUFFLE, MVT::v8i32, 1 }, // vpermd
646+
{ ISD::VECTOR_SHUFFLE, MVT::v16i16, 2 }, // vperm2i128 + pshufb
647+
{ ISD::VECTOR_SHUFFLE, MVT::v32i8, 2 } // vperm2i128 + pshufb
648+
};
649+
650+
if (ST->hasAVX2())
651+
if (const auto *Entry =
652+
CostTableLookup(AVX2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
653+
return LT.first * Entry->Cost;
654+
655+
static const CostTblEntry AVX1ShuffleTbl[] = {
656+
{ ISD::VECTOR_SHUFFLE, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
657+
{ ISD::VECTOR_SHUFFLE, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
658+
{ ISD::VECTOR_SHUFFLE, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
659+
{ ISD::VECTOR_SHUFFLE, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
660+
{ ISD::VECTOR_SHUFFLE, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
661+
// + vinsertf128
662+
{ ISD::VECTOR_SHUFFLE, MVT::v32i8, 4 } // vextractf128 + 2*pshufb
663+
// + vinsertf128
664+
};
665+
666+
if (ST->hasAVX())
667+
if (const auto *Entry =
668+
CostTableLookup(AVX1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
669+
return LT.first * Entry->Cost;
670+
671+
static const CostTblEntry SSSE3ShuffleTbl[] = {
672+
{ ISD::VECTOR_SHUFFLE, MVT::v8i16, 1 }, // pshufb
673+
{ ISD::VECTOR_SHUFFLE, MVT::v16i8, 1 } // pshufb
674+
};
675+
676+
if (ST->hasSSSE3())
677+
if (const auto *Entry =
678+
CostTableLookup(SSSE3ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
679+
return LT.first * Entry->Cost;
680+
681+
static const CostTblEntry SSE2ShuffleTbl[] = {
682+
{ ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, // shufpd
683+
{ ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, // pshufd
684+
{ ISD::VECTOR_SHUFFLE, MVT::v4i32, 1 }, // pshufd
685+
{ ISD::VECTOR_SHUFFLE, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
686+
{ ISD::VECTOR_SHUFFLE, MVT::v16i8, 9 } // 2*pshuflw + 2*pshufhw
687+
// + 2*pshufd + 2*unpck + packus
688+
};
689+
690+
if (ST->hasSSE2())
691+
if (const auto *Entry =
692+
CostTableLookup(SSE2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
693+
return LT.first * Entry->Cost;
694+
695+
static const CostTblEntry SSE1ShuffleTbl[] = {
696+
{ ISD::VECTOR_SHUFFLE, MVT::v4f32, 1 }, // shufps
697+
};
698+
699+
if (ST->hasSSE1())
700+
if (const auto *Entry =
701+
CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
702+
return LT.first * Entry->Cost;
613703
}
614704

615705
if (Kind == TTI::SK_Alternate) {

llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll

Lines changed: 56 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,15 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
1818
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1919

2020
; SSE: cost of 2 {{.*}} %V256 = shufflevector
21-
; AVX: cost of 3 {{.*}} %V256 = shufflevector
22-
; AVX512: cost of 3 {{.*}} %V256 = shufflevector
21+
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
22+
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
23+
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
2324
%V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2425

2526
; SSE: cost of 4 {{.*}} %V512 = shufflevector
26-
; AVX: cost of 6 {{.*}} %V512 = shufflevector
27-
; AVX512: cost of 3 {{.*}} %V512 = shufflevector
27+
; AVX1: cost of 4 {{.*}} %V512 = shufflevector
28+
; AVX2: cost of 2 {{.*}} %V512 = shufflevector
29+
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
2830
%V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
2931

3032
ret void
@@ -38,13 +40,15 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
3840
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
3941

4042
; SSE: cost of 2 {{.*}} %V256 = shufflevector
41-
; AVX: cost of 3 {{.*}} %V256 = shufflevector
42-
; AVX512: cost of 3 {{.*}} %V256 = shufflevector
43+
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
44+
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
45+
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
4346
%V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
4447

4548
; SSE: cost of 4 {{.*}} %V512 = shufflevector
46-
; AVX: cost of 6 {{.*}} %V512 = shufflevector
47-
; AVX512: cost of 3 {{.*}} %V512 = shufflevector
49+
; AVX1: cost of 4 {{.*}} %V512 = shufflevector
50+
; AVX2: cost of 2 {{.*}} %V512 = shufflevector
51+
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
4852
%V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
4953

5054
ret void
@@ -63,13 +67,15 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
6367
%V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
6468

6569
; SSE: cost of 2 {{.*}} %V256 = shufflevector
66-
; AVX: cost of 3 {{.*}} %V256 = shufflevector
67-
; AVX512: cost of 3 {{.*}} %V256 = shufflevector
70+
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
71+
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
72+
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
6873
%V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
6974

7075
; SSE: cost of 4 {{.*}} %V512 = shufflevector
71-
; AVX: cost of 6 {{.*}} %V512 = shufflevector
72-
; AVX512: cost of 3 {{.*}} %V512 = shufflevector
76+
; AVX1: cost of 4 {{.*}} %V512 = shufflevector
77+
; AVX2: cost of 2 {{.*}} %V512 = shufflevector
78+
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
7379
%V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
7480

7581
ret void
@@ -88,55 +94,73 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
8894
%V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
8995

9096
; SSE: cost of 2 {{.*}} %V256 = shufflevector
91-
; AVX: cost of 3 {{.*}} %V256 = shufflevector
92-
; AVX512: cost of 3 {{.*}} %V256 = shufflevector
97+
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
98+
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
99+
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
93100
%V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
94101

95102
; SSE: cost of 4 {{.*}} %V512 = shufflevector
96-
; AVX: cost of 6 {{.*}} %V512 = shufflevector
97-
; AVX512: cost of 3 {{.*}} %V512 = shufflevector
103+
; AVX1: cost of 4 {{.*}} %V512 = shufflevector
104+
; AVX2: cost of 2 {{.*}} %V512 = shufflevector
105+
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
98106
%V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
99107

100108
ret void
101109
}
102110

103111
; CHECK-LABEL: 'test_vXi16'
104112
define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
105-
; SSE: cost of 1 {{.*}} %V128 = shufflevector
113+
; SSE2: cost of 3 {{.*}} %V128 = shufflevector
114+
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
115+
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
106116
; AVX: cost of 1 {{.*}} %V128 = shufflevector
107117
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
108118
%V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
109119

110-
; SSE: cost of 2 {{.*}} %V256 = shufflevector
111-
; AVX: cost of 3 {{.*}} %V256 = shufflevector
112-
; AVX512: cost of 3 {{.*}} %V256 = shufflevector
120+
; SSE2: cost of 6 {{.*}} %V256 = shufflevector
121+
; SSSE3: cost of 2 {{.*}} %V256 = shufflevector
122+
; SSE42: cost of 2 {{.*}} %V256 = shufflevector
123+
; AVX1: cost of 4 {{.*}} %V256 = shufflevector
124+
; AVX2: cost of 2 {{.*}} %V256 = shufflevector
125+
; AVX512: cost of 2 {{.*}} %V256 = shufflevector
113126
%V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
114127

115-
; SSE: cost of 4 {{.*}} %V512 = shufflevector
116-
; AVX: cost of 6 {{.*}} %V512 = shufflevector
117-
; AVX512F: cost of 6 {{.*}} %V512 = shufflevector
118-
; AVX512BW: cost of 3 {{.*}} %V512 = shufflevector
128+
; SSE2: cost of 12 {{.*}} %V512 = shufflevector
129+
; SSSE3: cost of 4 {{.*}} %V512 = shufflevector
130+
; SSE42: cost of 4 {{.*}} %V512 = shufflevector
131+
; AVX1: cost of 8 {{.*}} %V512 = shufflevector
132+
; AVX2: cost of 4 {{.*}} %V512 = shufflevector
133+
; AVX512F: cost of 4 {{.*}} %V512 = shufflevector
134+
; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
119135
%V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
120136

121137
ret void
122138
}
123139

124140
; CHECK-LABEL: 'test_vXi8'
125141
define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
126-
; SSE: cost of 1 {{.*}} %V128 = shufflevector
142+
; SSE2: cost of 9 {{.*}} %V128 = shufflevector
143+
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
144+
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
127145
; AVX: cost of 1 {{.*}} %V128 = shufflevector
128146
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
129147
%V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
130148

131-
; SSE: cost of 2 {{.*}} %V256 = shufflevector
132-
; AVX: cost of 3 {{.*}} %V256 = shufflevector
133-
; AVX512: cost of 3 {{.*}} %V256 = shufflevector
149+
; SSE2: cost of 18 {{.*}} %V256 = shufflevector
150+
; SSSE3: cost of 2 {{.*}} %V256 = shufflevector
151+
; SSE42: cost of 2 {{.*}} %V256 = shufflevector
152+
; AVX1: cost of 4 {{.*}} %V256 = shufflevector
153+
; AVX2: cost of 2 {{.*}} %V256 = shufflevector
154+
; AVX512: cost of 2 {{.*}} %V256 = shufflevector
134155
%V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
135156

136-
; SSE: cost of 4 {{.*}} %V512 = shufflevector
137-
; AVX: cost of 6 {{.*}} %V512 = shufflevector
138-
; AVX512F: cost of 6 {{.*}} %V512 = shufflevector
139-
; AVX512BW: cost of 3 {{.*}} %V512 = shufflevector
157+
; SSE2: cost of 36 {{.*}} %V512 = shufflevector
158+
; SSSE3: cost of 4 {{.*}} %V512 = shufflevector
159+
; SSE42: cost of 4 {{.*}} %V512 = shufflevector
160+
; AVX1: cost of 8 {{.*}} %V512 = shufflevector
161+
; AVX2: cost of 4 {{.*}} %V512 = shufflevector
162+
; AVX512F: cost of 4 {{.*}} %V512 = shufflevector
163+
; AVX512BW: cost of 6 {{.*}} %V512 = shufflevector
140164
%V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
141165

142166
ret void

0 commit comments

Comments
 (0)