Skip to content

Commit de5cbe5

Browse files
RKSimon authored and yuxuanchen1997 committed
[X86] Fold blend(pshufb(x,m1),pshufb(y,m2)) -> blend(pshufb(x,blend(m1,m2)),pshufb(y,blend(m1,m2))) to reduce constant pool (#98466)
Summary: Share PSHUFB masks where we have no overlap in used elements. Fixes #98346.
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
Differential Revision: https://phabricator.intern.facebook.com/D60250984
1 parent 934b968 commit de5cbe5

23 files changed

+4936 -4771 lines changed (4936 additions, 4771 deletions)

llvm/lib/Target/X86/X86ISelLowering.cpp

+53 -17
Original file line number | Diff line number | Diff line change
@@ -41024,23 +41024,59 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4102441024
case X86ISD::BLENDI: {
4102541025
SDValue N0 = N.getOperand(0);
4102641026
SDValue N1 = N.getOperand(1);
41027-
41028-
// blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41029-
// TODO: Handle MVT::v16i16 repeated blend mask.
41030-
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
41031-
N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41032-
MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41033-
if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
41034-
SrcVT.getScalarSizeInBits() >= 32) {
41035-
unsigned Size = VT.getVectorNumElements();
41036-
unsigned NewSize = SrcVT.getVectorNumElements();
41037-
APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
41038-
APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
41039-
return DAG.getBitcast(
41040-
VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41041-
N1.getOperand(0),
41042-
DAG.getTargetConstant(NewBlendMask.getZExtValue(),
41043-
DL, MVT::i8)));
41027+
unsigned EltBits = VT.getScalarSizeInBits();
41028+
41029+
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
41030+
// blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41031+
// TODO: Handle MVT::v16i16 repeated blend mask.
41032+
if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41033+
MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41034+
unsigned SrcBits = SrcVT.getScalarSizeInBits();
41035+
if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
41036+
unsigned Size = VT.getVectorNumElements();
41037+
unsigned NewSize = SrcVT.getVectorNumElements();
41038+
APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
41039+
APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
41040+
return DAG.getBitcast(
41041+
VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41042+
N1.getOperand(0),
41043+
DAG.getTargetConstant(NewBlendMask.getZExtValue(),
41044+
DL, MVT::i8)));
41045+
}
41046+
}
41047+
// Share PSHUFB masks:
41048+
// blend(pshufb(x,m1),pshufb(y,m2))
41049+
// --> m3 = blend(m1,m2)
41050+
// blend(pshufb(x,m3),pshufb(y,m3))
41051+
if (N0.hasOneUse() && N1.hasOneUse()) {
41052+
SmallVector<int> Mask, ByteMask;
41053+
SmallVector<SDValue> Ops;
41054+
SDValue LHS = peekThroughOneUseBitcasts(N0);
41055+
SDValue RHS = peekThroughOneUseBitcasts(N1);
41056+
if (LHS.getOpcode() == X86ISD::PSHUFB &&
41057+
RHS.getOpcode() == X86ISD::PSHUFB &&
41058+
LHS.getOperand(1) != RHS.getOperand(1) &&
41059+
LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
41060+
getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
41061+
assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
41062+
RHS == peekThroughOneUseBitcasts(Ops[1]) &&
41063+
"BLENDI decode mismatch");
41064+
MVT ShufVT = LHS.getSimpleValueType();
41065+
SDValue MaskLHS = LHS.getOperand(1);
41066+
SDValue MaskRHS = RHS.getOperand(1);
41067+
llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
41068+
if (SDValue NewMask = combineX86ShufflesConstants(
41069+
ShufVT, {MaskLHS, MaskRHS}, ByteMask,
41070+
/*HasVariableMask=*/true, DAG, DL, Subtarget)) {
41071+
SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
41072+
LHS.getOperand(0), NewMask);
41073+
SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
41074+
RHS.getOperand(0), NewMask);
41075+
return DAG.getNode(X86ISD::BLENDI, DL, VT,
41076+
DAG.getBitcast(VT, NewLHS),
41077+
DAG.getBitcast(VT, NewRHS), N.getOperand(2));
41078+
}
41079+
}
4104441080
}
4104541081
}
4104641082
return SDValue();

llvm/test/CodeGen/X86/oddshuffles.ll

+8 -6
Original file line number | Diff line number | Diff line change
@@ -1294,10 +1294,11 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
12941294
; AVX2-SLOW-NEXT: vmovdqu (%rdx), %xmm1
12951295
; AVX2-SLOW-NEXT: vmovdqu (%rcx), %xmm2
12961296
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1297-
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
1297+
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11]
1298+
; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm5
12981299
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
1299-
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
1300-
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
1300+
; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1301+
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
13011302
; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
13021303
; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
13031304
; AVX2-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
@@ -1339,10 +1340,11 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
13391340
; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rdx), %xmm1
13401341
; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rcx), %xmm2
13411342
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1342-
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
1343+
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11]
1344+
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm5
13431345
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
1344-
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
1345-
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
1346+
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1347+
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
13461348
; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
13471349
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4
13481350
; AVX2-FAST-PERLANE-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]

llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll

+3 -2
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,9 @@ define void @shuffle_v64i8_to_v32i8_1(ptr %L, ptr %S) nounwind {
1212
; AVX512F: # %bb.0:
1313
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1414
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
15-
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
16-
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
15+
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
16+
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
17+
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1718
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1819
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1920
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)

llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll

+6 -4
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,9 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
2020
; AVX512F: # %bb.0:
2121
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
2222
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
23-
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
24-
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
23+
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
24+
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
25+
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2526
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2627
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2728
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
@@ -44,8 +45,9 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
4445
; AVX512VL-FAST-PERLANE: # %bb.0:
4546
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
4647
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
47-
; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
48-
; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
48+
; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
49+
; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
50+
; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4951
; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
5052
; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5153
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi)

0 commit comments

Comments
 (0)