Skip to content

Commit 505fac3

Browse files
RKSimonchencha3
authored andcommitted
[X86] getShuffleCost - recognise concat_vector(X,Y) shuffle as InsertSubvector instead of PermuteTwoSrc
We don't have a concat_vector shuffle kind and improveShuffleKindFromMask won't alter the base type to match it as InsertSubvector. But since this is how X86 will lower concat_vector anyhow, just recognise it explicitly. Another step for llvm#67803
1 parent 7cff40f commit 505fac3

File tree

2 files changed

+14
-37
lines changed

2 files changed

+14
-37
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1480,6 +1480,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
14801480

14811481
Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
14821482

1483+
// Recognize a basic concat_vector shuffle.
1484+
if (Kind == TTI::SK_PermuteTwoSrc &&
1485+
Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1486+
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1487+
return getShuffleCost(TTI::SK_InsertSubvector,
1488+
VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1489+
CostKind, Mask.size() / 2, BaseTp);
1490+
14831491
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
14841492
if (Kind == TTI::SK_Transpose)
14851493
Kind = TTI::SK_PermuteTwoSrc;

llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll

Lines changed: 6 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK
3-
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK
4-
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
2+
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s
3+
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s
4+
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s
55

66
define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b) {
77
; CHECK-LABEL: @PR67803(
@@ -11,59 +11,28 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
1111
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
1212
; CHECK-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1313
; CHECK-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
14-
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[SEXT_I22]] to <2 x i64>
1514
; CHECK-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1615
; CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
17-
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[SEXT_I]] to <2 x i64>
18-
; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
16+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1917
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
2018
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2119
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
2220
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
23-
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[SHUFFLE_I]] to <32 x i8>
21+
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
2422
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP9]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2523
; CHECK-NEXT: [[TMP11:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP6]], <16 x i8> [[TMP8]], <16 x i8> [[TMP10]])
2624
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2725
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i64> [[A]] to <32 x i8>
2826
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2927
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i64> [[B]] to <32 x i8>
3028
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
31-
; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i64> [[SHUFFLE_I]] to <32 x i8>
29+
; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
3230
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
3331
; CHECK-NEXT: [[TMP19:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP14]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]])
3432
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <2 x i64>
3533
; CHECK-NEXT: [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3634
; CHECK-NEXT: ret <4 x i64> [[SHUFFLE_I23]]
3735
;
38-
; AVX512-LABEL: @PR67803(
39-
; AVX512-NEXT: entry:
40-
; AVX512-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
41-
; AVX512-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32>
42-
; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
43-
; AVX512-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
44-
; AVX512-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
45-
; AVX512-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
46-
; AVX512-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
47-
; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
48-
; AVX512-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
49-
; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
50-
; AVX512-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
51-
; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
52-
; AVX512-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
53-
; AVX512-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
54-
; AVX512-NEXT: [[TMP10:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i8> [[TMP9]])
55-
; AVX512-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
56-
; AVX512-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[A]] to <32 x i8>
57-
; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
58-
; AVX512-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[B]] to <32 x i8>
59-
; AVX512-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
60-
; AVX512-NEXT: [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
61-
; AVX512-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
62-
; AVX512-NEXT: [[TMP18:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP13]], <16 x i8> [[TMP15]], <16 x i8> [[TMP17]])
63-
; AVX512-NEXT: [[TMP19:%.*]] = bitcast <16 x i8> [[TMP18]] to <2 x i64>
64-
; AVX512-NEXT: [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP19]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
65-
; AVX512-NEXT: ret <4 x i64> [[SHUFFLE_I23]]
66-
;
6736
entry:
6837
%0 = bitcast <4 x i64> %x to <8 x i32>
6938
%extract = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

0 commit comments

Comments
 (0)