Skip to content

Commit 0b4ee8d

Browse files
authored
[X86] combineKSHIFT - fold kshiftr(kshiftr/extract_subvector(X,C1),C2) --> kshiftr(X,C1+C2) (#115528)
Merge serial KSHIFTR nodes, possibly separated by an EXTRACT_SUBVECTOR, into a single KSHIFTR of the original source, so that the resulting mask shifts no longer depend on one another and can be computed in parallel.
1 parent 1ee740a commit 0b4ee8d

File tree

9 files changed

+208
-189
lines changed

9 files changed

+208
-189
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58706,11 +58706,30 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
5870658706
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
5870758707
TargetLowering::DAGCombinerInfo &DCI) {
5870858708
EVT VT = N->getValueType(0);
58709-
58709+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5871058710
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
5871158711
return DAG.getConstant(0, SDLoc(N), VT);
5871258712

58713-
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58713+
// Fold kshiftr(extract_subvector(X,C1),C2)
58714+
// --> extract_subvector(kshiftr(X,C1+C2),0)
58715+
// Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
58716+
if (N->getOpcode() == X86ISD::KSHIFTR) {
58717+
SDLoc DL(N);
58718+
if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
58719+
N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
58720+
SDValue Src = N->getOperand(0).getOperand(0);
58721+
uint64_t Amt = N->getConstantOperandVal(1) +
58722+
N->getOperand(0).getConstantOperandVal(1);
58723+
EVT SrcVT = Src.getValueType();
58724+
if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
58725+
SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
58726+
DAG.getTargetConstant(Amt, DL, MVT::i8));
58727+
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
58728+
DAG.getIntPtrConstant(0, DL));
58729+
}
58730+
}
58731+
}
58732+
5871458733
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
5871558734
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
5871658735
return SDValue(N, 0);

llvm/test/CodeGen/X86/avx512-bugfix-26264.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
77
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
88
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
99
; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
10-
; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
10+
; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
1111
; AVX512BW-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
12-
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
13-
; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
14-
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
12+
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
13+
; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
14+
; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
1515
; AVX512BW-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
1616
; AVX512BW-NEXT: retq
1717
%res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
@@ -24,11 +24,11 @@ define <32 x i64> @test_load_32i64(ptr %ptrs, <32 x i1> %mask, <32 x i64> %src0)
2424
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
2525
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
2626
; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
27-
; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
27+
; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
2828
; AVX512BW-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k2}
29-
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
30-
; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k1}
31-
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
29+
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
30+
; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k2}
31+
; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
3232
; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1}
3333
; AVX512BW-NEXT: retq
3434
%res = call <32 x i64> @llvm.masked.load.v32i64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)

llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -261,11 +261,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
261261
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
262262
; SKX-NEXT: vpmovb2m %ymm0, %k1
263263
; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
264-
; SKX-NEXT: kshiftrw $8, %k1, %k2
264+
; SKX-NEXT: kshiftrd $8, %k1, %k2
265265
; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
266-
; SKX-NEXT: kshiftrd $16, %k1, %k1
267-
; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
268-
; SKX-NEXT: kshiftrw $8, %k1, %k1
266+
; SKX-NEXT: kshiftrd $16, %k1, %k2
267+
; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
268+
; SKX-NEXT: kshiftrd $24, %k1, %k1
269269
; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
270270
; SKX-NEXT: retq
271271
%res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)

llvm/test/CodeGen/X86/pr33349.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,23 +17,23 @@ target triple = "x86_64-unknown-linux-gnu"
1717
; KNL-NEXT: fldz
1818
; KNL-NEXT: fld %st(0)
1919
; KNL-NEXT: fcmovne %st(2), %st
20-
; KNL-NEXT: testb $2, %al
21-
; KNL-NEXT: fld %st(1)
22-
; KNL-NEXT: fcmovne %st(3), %st
2320
; KNL-NEXT: kmovw %k0, %eax
2421
; KNL-NEXT: testb $1, %al
22+
; KNL-NEXT: fld %st(1)
23+
; KNL-NEXT: fcmovne %st(3), %st
24+
; KNL-NEXT: testb $2, %al
2525
; KNL-NEXT: fld %st(2)
2626
; KNL-NEXT: fcmovne %st(4), %st
27-
; KNL-NEXT: testb $2, %al
27+
; KNL-NEXT: testb $8, %al
2828
; KNL-NEXT: fxch %st(3)
2929
; KNL-NEXT: fcmovne %st(4), %st
3030
; KNL-NEXT: fstp %st(4)
3131
; KNL-NEXT: fxch %st(3)
32+
; KNL-NEXT: fstpt 30(%rdi)
33+
; KNL-NEXT: fxch %st(1)
3234
; KNL-NEXT: fstpt 10(%rdi)
3335
; KNL-NEXT: fxch %st(1)
3436
; KNL-NEXT: fstpt (%rdi)
35-
; KNL-NEXT: fxch %st(1)
36-
; KNL-NEXT: fstpt 30(%rdi)
3737
; KNL-NEXT: fstpt 20(%rdi)
3838
; KNL-NEXT: vzeroupper
3939
; KNL-NEXT: retq
@@ -49,23 +49,23 @@ target triple = "x86_64-unknown-linux-gnu"
4949
; SKX-NEXT: fldz
5050
; SKX-NEXT: fld %st(0)
5151
; SKX-NEXT: fcmovne %st(2), %st
52-
; SKX-NEXT: testb $2, %al
53-
; SKX-NEXT: fld %st(1)
54-
; SKX-NEXT: fcmovne %st(3), %st
5552
; SKX-NEXT: kmovd %k0, %eax
5653
; SKX-NEXT: testb $1, %al
54+
; SKX-NEXT: fld %st(1)
55+
; SKX-NEXT: fcmovne %st(3), %st
56+
; SKX-NEXT: testb $2, %al
5757
; SKX-NEXT: fld %st(2)
5858
; SKX-NEXT: fcmovne %st(4), %st
59-
; SKX-NEXT: testb $2, %al
59+
; SKX-NEXT: testb $8, %al
6060
; SKX-NEXT: fxch %st(3)
6161
; SKX-NEXT: fcmovne %st(4), %st
6262
; SKX-NEXT: fstp %st(4)
6363
; SKX-NEXT: fxch %st(3)
64+
; SKX-NEXT: fstpt 30(%rdi)
65+
; SKX-NEXT: fxch %st(1)
6466
; SKX-NEXT: fstpt 10(%rdi)
6567
; SKX-NEXT: fxch %st(1)
6668
; SKX-NEXT: fstpt (%rdi)
67-
; SKX-NEXT: fxch %st(1)
68-
; SKX-NEXT: fstpt 30(%rdi)
6969
; SKX-NEXT: fstpt 20(%rdi)
7070
; SKX-NEXT: retq
7171
bb:

llvm/test/CodeGen/X86/pr34177.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,18 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
5151
; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
5252
; AVX512VL-NEXT: kshiftrb $2, %k0, %k1
5353
; AVX512VL-NEXT: kmovd %k0, %eax
54-
; AVX512VL-NEXT: testb $2, %al
54+
; AVX512VL-NEXT: testb $8, %al
5555
; AVX512VL-NEXT: fld1
5656
; AVX512VL-NEXT: fldz
5757
; AVX512VL-NEXT: fld %st(0)
5858
; AVX512VL-NEXT: fcmovne %st(2), %st
59-
; AVX512VL-NEXT: testb $1, %al
59+
; AVX512VL-NEXT: testb $2, %al
6060
; AVX512VL-NEXT: fld %st(1)
6161
; AVX512VL-NEXT: fcmovne %st(3), %st
62-
; AVX512VL-NEXT: kmovd %k1, %eax
63-
; AVX512VL-NEXT: testb $2, %al
62+
; AVX512VL-NEXT: testb $1, %al
6463
; AVX512VL-NEXT: fld %st(2)
6564
; AVX512VL-NEXT: fcmovne %st(4), %st
65+
; AVX512VL-NEXT: kmovd %k1, %eax
6666
; AVX512VL-NEXT: testb $1, %al
6767
; AVX512VL-NEXT: fxch %st(3)
6868
; AVX512VL-NEXT: fcmovne %st(4), %st
@@ -77,12 +77,12 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
7777
; AVX512VL-NEXT: fstpt 10(%rdi)
7878
; AVX512VL-NEXT: fxch %st(1)
7979
; AVX512VL-NEXT: fadd %st, %st(0)
80+
; AVX512VL-NEXT: fstpt 60(%rdi)
81+
; AVX512VL-NEXT: fadd %st, %st(0)
8082
; AVX512VL-NEXT: fstpt 20(%rdi)
8183
; AVX512VL-NEXT: fadd %st, %st(0)
8284
; AVX512VL-NEXT: fstpt (%rdi)
8385
; AVX512VL-NEXT: fadd %st, %st(0)
84-
; AVX512VL-NEXT: fstpt 60(%rdi)
85-
; AVX512VL-NEXT: fadd %st, %st(0)
8686
; AVX512VL-NEXT: fstpt 40(%rdi)
8787
%1 = icmp eq <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %a
8888
%2 = select <4 x i1> %1, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer

llvm/test/CodeGen/X86/vec_smulo.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2668,11 +2668,11 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
26682668
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
26692669
; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
26702670
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
2671-
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
2671+
; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
26722672
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
2673-
; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
2674-
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
2675-
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
2673+
; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
2674+
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
2675+
; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
26762676
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
26772677
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
26782678
; AVX512BW-NEXT: retq

llvm/test/CodeGen/X86/vec_umulo.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2329,11 +2329,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
23292329
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
23302330
; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k1
23312331
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
2332-
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
2332+
; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
23332333
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
2334-
; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
2335-
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
2336-
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
2334+
; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
2335+
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
2336+
; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
23372337
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
23382338
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
23392339
; AVX512BW-NEXT: retq

llvm/test/CodeGen/X86/vector-compress.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -840,12 +840,12 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
840840
; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240
841841
; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
842842
; AVX512VL-NEXT: vpmovb2m %zmm0, %k1
843+
; AVX512VL-NEXT: kshiftrq $48, %k1, %k3
843844
; AVX512VL-NEXT: kshiftrq $32, %k1, %k4
844-
; AVX512VL-NEXT: kshiftrd $16, %k4, %k3
845-
; AVX512VL-NEXT: kshiftrd $16, %k1, %k2
845+
; AVX512VL-NEXT: kshiftrq $16, %k1, %k2
846846
; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k1} {z}
847847
; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp)
848-
; AVX512VL-NEXT: kshiftrw $8, %k1, %k0
848+
; AVX512VL-NEXT: kshiftrq $8, %k1, %k0
849849
; AVX512VL-NEXT: kxorw %k0, %k1, %k0
850850
; AVX512VL-NEXT: kshiftrw $4, %k0, %k5
851851
; AVX512VL-NEXT: kxorw %k5, %k0, %k0
@@ -859,7 +859,7 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
859859
; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp,%rax,4)
860860
; AVX512VL-NEXT: vpcompressd %zmm3, %zmm0 {%k4} {z}
861861
; AVX512VL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
862-
; AVX512VL-NEXT: kshiftrw $8, %k4, %k0
862+
; AVX512VL-NEXT: kshiftrq $40, %k1, %k0
863863
; AVX512VL-NEXT: kxorw %k0, %k4, %k0
864864
; AVX512VL-NEXT: kshiftrw $4, %k0, %k4
865865
; AVX512VL-NEXT: kxorw %k4, %k0, %k0

0 commit comments

Comments
 (0)