Skip to content

Commit 4f93327

Browse files
authored
[CostModel][X86] Improve cost estimation of insert_subvector shuffle patterns of legalized types (#119363)
In cases where the base and subvector types in an insert_subvector pattern legalize to the same width through splitting, we can assume that the shuffle becomes free, as the legalized vectors will not overlap. Note that this isn't true if the vectors have been widened during legalization (e.g. a v2f32 insertion into v4f32 would legalize to a v4f32 insertion into v4f32). Noticed while working on adding processShuffleMasks handling for SK_PermuteTwoSrc.
1 parent ed91843 commit 4f93327

11 files changed

+403
-392
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1630,14 +1630,19 @@ InstructionCost X86TTIImpl::getShuffleCost(
16301630

16311631
// Subvector insertions are cheap if the subvectors are aligned.
16321632
// Note that in general, the insertion starting at the beginning of a vector
1633-
// isn't free, because we need to preserve the rest of the wide vector.
1633+
// isn't free, because we need to preserve the rest of the wide vector,
1634+
// but if the destination vector legalizes to the same width as the subvector
1635+
// then the insertion will simplify to a (free) register copy.
16341636
if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
16351637
int NumElts = LT.second.getVectorNumElements();
16361638
std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
16371639
if (SubLT.second.isVector()) {
16381640
int NumSubElts = SubLT.second.getVectorNumElements();
1641+
bool MatchingTypes =
1642+
NumElts == NumSubElts &&
1643+
(SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
16391644
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1640-
return SubLT.first;
1645+
return MatchingTypes ? TTI::TCC_Free : SubLT.first;
16411646
}
16421647

16431648
// If the insertion isn't aligned, treat it like a 2-op shuffle.

llvm/test/Analysis/CostModel/X86/shuffle-concat_subvector-codesize.ll

Lines changed: 42 additions & 42 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/X86/shuffle-concat_subvector-latency.ll

Lines changed: 42 additions & 42 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/X86/shuffle-concat_subvector-sizelatency.ll

Lines changed: 42 additions & 42 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/X86/shuffle-concat_subvector.ll

Lines changed: 42 additions & 42 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-codesize.ll

Lines changed: 49 additions & 49 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-latency.ll

Lines changed: 49 additions & 49 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-sizelatency.ll

Lines changed: 49 additions & 49 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll

Lines changed: 49 additions & 49 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,18 @@ define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
6464
}
6565

6666
define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) {
67-
; CHECK-LABEL: define <8 x i32> @concat_sext_v4i1_v8i32(
68-
; CHECK-SAME: <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]]) #[[ATTR0]] {
69-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[A0]], <4 x i1> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
70-
; CHECK-NEXT: [[R:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32>
71-
; CHECK-NEXT: ret <8 x i32> [[R]]
67+
; SSE-LABEL: define <8 x i32> @concat_sext_v4i1_v8i32(
68+
; SSE-SAME: <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]]) #[[ATTR0]] {
69+
; SSE-NEXT: [[X0:%.*]] = sext <4 x i1> [[A0]] to <4 x i32>
70+
; SSE-NEXT: [[X1:%.*]] = sext <4 x i1> [[A1]] to <4 x i32>
71+
; SSE-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
72+
; SSE-NEXT: ret <8 x i32> [[R]]
73+
;
74+
; AVX-LABEL: define <8 x i32> @concat_sext_v4i1_v8i32(
75+
; AVX-SAME: <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]]) #[[ATTR0]] {
76+
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[A0]], <4 x i1> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
77+
; AVX-NEXT: [[R:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32>
78+
; AVX-NEXT: ret <8 x i32> [[R]]
7279
;
7380
%x0 = sext <4 x i1> %a0 to <4 x i32>
7481
%x1 = sext <4 x i1> %a1 to <4 x i32>
@@ -90,11 +97,18 @@ define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) {
9097
}
9198

9299
define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) {
93-
; CHECK-LABEL: define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(
94-
; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] {
95-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
96-
; CHECK-NEXT: [[R:%.*]] = inttoptr <8 x i32> [[TMP1]] to <8 x ptr>
97-
; CHECK-NEXT: ret <8 x ptr> [[R]]
100+
; SSE-LABEL: define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(
101+
; SSE-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] {
102+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
103+
; SSE-NEXT: [[R:%.*]] = inttoptr <8 x i32> [[TMP1]] to <8 x ptr>
104+
; SSE-NEXT: ret <8 x ptr> [[R]]
105+
;
106+
; AVX-LABEL: define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(
107+
; AVX-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] {
108+
; AVX-NEXT: [[X0:%.*]] = inttoptr <4 x i32> [[A0]] to <4 x ptr>
109+
; AVX-NEXT: [[X1:%.*]] = inttoptr <4 x i32> [[A1]] to <4 x ptr>
110+
; AVX-NEXT: [[R:%.*]] = shufflevector <4 x ptr> [[X0]], <4 x ptr> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
111+
; AVX-NEXT: ret <8 x ptr> [[R]]
98112
;
99113
%x0 = inttoptr <4 x i32> %a0 to <4 x ptr>
100114
%x1 = inttoptr <4 x i32> %a1 to <4 x ptr>
@@ -138,9 +152,8 @@ define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1)
138152
define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double> %a1) {
139153
; CHECK-LABEL: define <16 x float> @concat_fptrunc_v8f64_v16f32(
140154
; CHECK-SAME: <8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]]) #[[ATTR0]] {
141-
; CHECK-NEXT: [[X0:%.*]] = fptrunc <8 x double> [[A0]] to <8 x float>
142-
; CHECK-NEXT: [[X1:%.*]] = fptrunc <8 x double> [[A1]] to <8 x float>
143-
; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x float> [[X0]], <8 x float> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
155+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0]], <8 x double> [[A1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
156+
; CHECK-NEXT: [[R:%.*]] = fptrunc <16 x double> [[TMP1]] to <16 x float>
144157
; CHECK-NEXT: ret <16 x float> [[R]]
145158
;
146159
%x0 = fptrunc <8 x double> %a0 to <8 x float>

llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -34,19 +34,12 @@ entry:
3434
}
3535

3636
define <8 x i32> @test3(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
37-
; SSE-LABEL: @test3(
38-
; SSE-NEXT: entry:
39-
; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]])
40-
; SSE-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
41-
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
42-
; SSE-NEXT: ret <8 x i32> [[TMP6]]
43-
;
44-
; AVX-LABEL: @test3(
45-
; AVX-NEXT: entry:
46-
; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
47-
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> [[TMP3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
48-
; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
49-
; AVX-NEXT: ret <8 x i32> [[TMP6]]
37+
; CHECK-LABEL: @test3(
38+
; CHECK-NEXT: entry:
39+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
40+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> [[TMP3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
41+
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
42+
; CHECK-NEXT: ret <8 x i32> [[TMP6]]
5043
;
5144
entry:
5245
%4 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1)

0 commit comments

Comments
 (0)