@@ -740,14 +740,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -763,14 +763,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
@@ -786,14 +786,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
@@ -809,14 +809,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
@@ -832,14 +832,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -855,14 +855,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
@@ -878,14 +878,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
@@ -901,14 +901,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq