[X86] Fold blend(pshufb(x,m1),pshufb(y,m2)) -> blend(pshufb(x,blend(m1,m2)),pshufb(y,blend(m1,m2))) to reduce constant pool #98466
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: Share PSHUFB masks where we have no overlap in used elements.

Fixes #98346

Patch is 1.24 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/98466.diff

23 Files Affected:
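Before the diff itself, the transform is easiest to see with a concrete pair of masks. The sketch below is a minimal standalone C++ illustration of the idea, not the LLVM implementation; the 16-byte width, the `0x80` "zero this byte" convention, and the particular even/odd masks are assumptions chosen for the example. When the blend only keeps even bytes of the first `pshufb` and odd bytes of the second, the two mask constants can be merged into one shared constant without changing the blended result.

```c++
// Minimal standalone sketch of the fold (not LLVM code).
// blend(pshufb(x, m1), pshufb(y, m2)) can reuse one shared mask
// m3 = blend(m1, m2) as long as each operand's mask only matters in the
// lanes the blend actually keeps from that operand.
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::size_t NumBytes = 16; // one 128-bit PSHUFB register
using ByteMask = std::array<std::uint8_t, NumBytes>;

// m3[i] = Blend[i] ? m2[i] : m1[i] -- the "blend(m1, m2)" of the fold.
ByteMask blendMasks(const ByteMask &M1, const ByteMask &M2,
                    const std::array<bool, NumBytes> &Blend) {
  ByteMask M3{};
  for (std::size_t I = 0; I != NumBytes; ++I)
    M3[I] = Blend[I] ? M2[I] : M1[I];
  return M3;
}

int main() {
  // Hypothetical masks: the blend keeps only the even bytes of the first
  // pshufb and only the odd bytes of the second, so the 0x80 ("zero this
  // byte") entries in the discarded lanes are don't-cares.
  ByteMask M1{0, 0x80, 2, 0x80, 4, 0x80, 6, 0x80,
              8, 0x80, 10, 0x80, 12, 0x80, 14, 0x80};
  ByteMask M2{0x80, 1, 0x80, 3, 0x80, 5, 0x80, 7,
              0x80, 9, 0x80, 11, 0x80, 13, 0x80, 15};
  std::array<bool, NumBytes> Blend{};
  for (std::size_t I = 0; I != NumBytes; ++I)
    Blend[I] = (I % 2) != 0; // odd lanes come from the second pshufb

  ByteMask M3 = blendMasks(M1, M2, Blend);
  // In every lane the blend keeps from operand X, M3 matches X's original
  // mask, so blend(pshufb(x, M3), pshufb(y, M3)) is unchanged -- but only
  // one mask constant has to live in the constant pool.
  for (std::uint8_t B : M3)
    std::printf("%u ", unsigned(B)); // prints 0 1 2 ... 15
  std::printf("\n");
  return 0;
}
```

With a shared mask, the backend materializes a single constant-pool load (or broadcast) instead of two, which is what the updated test checks reflect. The truncated diff of the affected files follows.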
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ea916b778d86d..92e2c75892754 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41016,23 +41016,59 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
case X86ISD::BLENDI: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
-
- // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
- // TODO: Handle MVT::v16i16 repeated blend mask.
- if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
- N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
- MVT SrcVT = N0.getOperand(0).getSimpleValueType();
- if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
- SrcVT.getScalarSizeInBits() >= 32) {
- unsigned Size = VT.getVectorNumElements();
- unsigned NewSize = SrcVT.getVectorNumElements();
- APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
- APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
- return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
- N1.getOperand(0),
- DAG.getTargetConstant(NewBlendMask.getZExtValue(),
- DL, MVT::i8)));
+ unsigned EltBits = VT.getScalarSizeInBits();
+
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
+ // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
+ // TODO: Handle MVT::v16i16 repeated blend mask.
+ if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
+ MVT SrcVT = N0.getOperand(0).getSimpleValueType();
+ unsigned SrcBits = SrcVT.getScalarSizeInBits();
+ if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
+ unsigned Size = VT.getVectorNumElements();
+ unsigned NewSize = SrcVT.getVectorNumElements();
+ APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
+ APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
+ N1.getOperand(0),
+ DAG.getTargetConstant(NewBlendMask.getZExtValue(),
+ DL, MVT::i8)));
+ }
+ }
+ // Share PSHUFB masks:
+ // blend(pshufb(x,m1),pshufb(y,m2))
+ // --> m3 = blend(m1,m2)
+ // blend(pshufb(x,m3),pshufb(y,m3))
+ if (N0.hasOneUse() && N1.hasOneUse()) {
+ SmallVector<int> Mask, ByteMask;
+ SmallVector<SDValue> Ops;
+ SDValue LHS = peekThroughOneUseBitcasts(N0);
+ SDValue RHS = peekThroughOneUseBitcasts(N1);
+ if (LHS.getOpcode() == X86ISD::PSHUFB &&
+ RHS.getOpcode() == X86ISD::PSHUFB &&
+ LHS.getOperand(1) != RHS.getOperand(1) &&
+ (LHS.getOperand(1).hasOneUse() || RHS.getOperand(1).hasOneUse()) &&
+ getTargetShuffleMask(N, false, Ops, Mask)) {
+ assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
+ RHS == peekThroughOneUseBitcasts(Ops[1]) &&
+ "BLENDI decode mismatch");
+ MVT ShufVT = LHS.getSimpleValueType();
+ SDValue MaskLHS = LHS.getOperand(1);
+ SDValue MaskRHS = RHS.getOperand(1);
+ llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
+ if (SDValue NewMask = combineX86ShufflesConstants(
+ ShufVT, {MaskLHS, MaskRHS}, ByteMask, true, DAG, DL,
+ Subtarget)) {
+ SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
+ LHS.getOperand(0), NewMask);
+ SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
+ RHS.getOperand(0), NewMask);
+ return DAG.getNode(X86ISD::BLENDI, DL, VT,
+ DAG.getBitcast(VT, NewLHS),
+ DAG.getBitcast(VT, NewRHS), N.getOperand(2));
+ }
+ }
}
}
return SDValue();
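The `// Share PSHUFB masks` comment in the hunk above first rescales the BLENDI lane mask to byte granularity via `narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask)` so it lines up with the per-byte PSHUFB constants before they are folded together. As a rough illustration only (this mirrors the arithmetic rather than LLVM's helper, and uses a simplified 0/1 "take from LHS/RHS" convention instead of concatenated shuffle indices):

```c++
// Rough illustration (not LLVM's narrowShuffleMaskElts): expand a BLENDI
// lane-selection mask to byte granularity. 0 = take from the LHS pshufb,
// 1 = take from the RHS pshufb.
#include <cstdio>
#include <vector>

std::vector<int> scaleBlendMaskToBytes(const std::vector<int> &LaneMask,
                                       unsigned EltBytes) {
  std::vector<int> ByteMask;
  ByteMask.reserve(LaneMask.size() * EltBytes);
  for (int Sel : LaneMask)
    for (unsigned B = 0; B != EltBytes; ++B)
      ByteMask.push_back(Sel); // each lane expands to EltBytes identical bytes
  return ByteMask;
}

int main() {
  // e.g. a v8i16 blend with immediate 0b01010101 selects lanes
  // {1,0,1,0,1,0,1,0}; at 2 bytes per i16 lane that becomes a 16-entry
  // byte-level selection usable against the PSHUFB byte masks.
  std::vector<int> LaneMask = {1, 0, 1, 0, 1, 0, 1, 0};
  std::vector<int> Bytes = scaleBlendMaskToBytes(LaneMask, /*EltBytes=*/2);
  for (int Sel : Bytes)
    std::printf("%d ", Sel); // 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0
  std::printf("\n");
  return 0;
}
```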
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index d3a3b1e980db0..b40b2c82843cc 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1294,10 +1294,11 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-SLOW-NEXT: vmovdqu (%rdx), %xmm1
; AVX2-SLOW-NEXT: vmovdqu (%rcx), %xmm2
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11]
+; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm5
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
@@ -1339,10 +1340,11 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rcx), %xmm2
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
index e94f51233256c..45842d4148a8b 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -12,8 +12,9 @@ define void @shuffle_v64i8_to_v32i8_1(ptr %L, ptr %S) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 95e249984e184..cf0820aac3262 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -20,8 +20,9 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
@@ -44,8 +45,9 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index 3bc97f71f04fb..00e43df15deea 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -488,8 +488,9 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovaps %ymm2, (%rsi)
@@ -506,8 +507,9 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm2
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
@@ -524,8 +526,9 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
@@ -736,14 +739,13 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovaps %ymm5, (%rsi)
@@ -768,14 +770,13 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm4
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX2-FP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
@@ -800,14 +801,13 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm4
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
@@ -1180,20 +1180,20 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX2-NEXT: vmovdqa 224(%rdi), %ymm6
; AVX2-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm8
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm2[0,2],ymm7[4,6],ymm2[4,6]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,2],ymm2[0,2],ymm9[4,6],ymm2[4,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm10[0,2],ymm7[0,2],ymm10[4,6],ymm7[4,6]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,2],ymm9[0,2],ymm10[4,6],ymm9[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
@@ -1206,32 +1206,31 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX2-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
-; AVX2-NEXT: vpshufb %ymm12, %ymm9, %ymm9
-; AVX2-NEXT: vmovd...
[truncated]
ping?
LGTM assuming buildkite tests pass. Please wait a day to push so others have a chance to review.
LGTM