Commit 38ad0df

[X86] combineEXTRACT_SUBVECTOR - extract from a larger subvector insertion (#132950)
Fold EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2) -> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)

This extends the existing fold, which required the extract/insert subvector indices to match; the fold now applies whenever the originally inserted subvector is entirely contained within the extracted range.

This helps avoid unnecessary use of 512-bit vectors and improves the chance of concatenation folds.
1 parent a308d42 commit 38ad0df

9 files changed: +109 -107 lines
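
For reference, the index arithmetic behind the new fold can be spelled out as a small standalone sketch (C++17, not part of the patch); the function name foldedInsertIndex and the concrete element counts below are invented for illustration only, loosely modelled on the 512-bit cases in the updated tests.

    // Minimal sketch of the containment check used by the fold:
    // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,InsIdx),ExtIdx) can be rewritten as
    // INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,ExtIdx),SUB,InsIdx-ExtIdx)
    // whenever SUB lies entirely inside the extracted element range.
    #include <cstdint>
    #include <cstdio>
    #include <optional>

    // Returns the new insertion index if the fold applies, std::nullopt otherwise.
    // ExtIdx/NumExtElts describe the extraction, InsIdx/NumSubElts the inserted SUB.
    std::optional<uint64_t> foldedInsertIndex(uint64_t ExtIdx, uint64_t NumExtElts,
                                              uint64_t InsIdx, uint64_t NumSubElts) {
      bool SubContained =
          ExtIdx <= InsIdx && (ExtIdx + NumExtElts) >= (InsIdx + NumSubElts);
      if (!SubContained)
        return std::nullopt;
      return InsIdx - ExtIdx;
    }

    int main() {
      // Example shapes (assumed, for illustration): an 8 x i16 subvector inserted
      // at element 8 of a 32 x i16 vector, with the low 16 x i16 then extracted.
      if (auto NewIdx = foldedInsertIndex(/*ExtIdx=*/0, /*NumExtElts=*/16,
                                          /*InsIdx=*/8, /*NumSubElts=*/8))
        std::printf("fold applies, new insert index = %llu\n",
                    (unsigned long long)*NewIdx); // prints 8
      // The previous fold also required ExtIdx == InsIdx, so a case like this was
      // not simplified and the insert stayed on the wider 512-bit type.
      return 0;
    }

This is consistent with the test updates below, where 512-bit vinserti32x4 inserts into zmm registers become 256-bit vinserti128 inserts into ymm registers.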

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 14 deletions
@@ -59085,20 +59085,22 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
     return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
   }
 
-  // If we are extracting from an insert into a larger vector, replace with a
-  // smaller insert if we don't access less than the original subvector. Don't
-  // do this for i1 vectors.
-  // TODO: Relax the matching indices requirement?
-  if (VT.getVectorElementType() != MVT::i1 &&
-      InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
-      IdxVal == InVec.getConstantOperandVal(2) &&
-      InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
-    SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
-                                 InVec.getOperand(0), N->getOperand(1));
-    unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
-    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
-                       InVec.getOperand(1),
-                       DAG.getVectorIdxConstant(NewIdxVal, DL));
+  // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
+  // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
+  // iff SUB is entirely contained in the extraction.
+  if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
+      InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
+    SDValue Src = InVec.getOperand(0);
+    SDValue Sub = InVec.getOperand(1);
+    EVT SubVT = Sub.getValueType();
+    uint64_t InsIdx = InVec.getConstantOperandVal(2);
+    if (IdxVal <= InsIdx &&
+        (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
+      SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
+                                   DAG.getVectorIdxConstant(IdxVal, DL));
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
+                         DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
+    }
   }
 
   // If we're extracting an upper subvector from a broadcast we should just

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll

Lines changed: 4 additions & 4 deletions
@@ -993,13 +993,13 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vmovdqa (%r8), %xmm2
 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3]
 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
 ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
 ; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
@@ -1035,7 +1035,6 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
@@ -1044,6 +1043,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
@@ -1085,13 +1085,13 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3]
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
@@ -1127,7 +1127,6 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
@@ -1136,6 +1135,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll

Lines changed: 32 additions & 32 deletions
@@ -740,14 +740,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -763,14 +763,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
@@ -786,14 +786,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
@@ -809,14 +809,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
@@ -832,14 +832,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -855,14 +855,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
@@ -878,14 +878,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
@@ -901,14 +901,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
