@@ -2470,8 +2470,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
2470
2470
; AVX512BW: # %bb.0:
2471
2471
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2472
2472
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2473
- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2474
- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2473
+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
2475
2474
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2476
2475
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
2477
2476
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -2609,8 +2608,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
2609
2608
; AVX512BW: # %bb.0:
2610
2609
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2611
2610
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2612
- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2613
- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2611
+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
2614
2612
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2615
2613
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2616
2614
; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
@@ -2740,8 +2738,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
2740
2738
; AVX512BW: # %bb.0:
2741
2739
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2742
2740
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2743
- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2744
- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2741
+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
2745
2742
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
2746
2743
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
2747
2744
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -2879,8 +2876,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
2879
2876
; AVX512BW: # %bb.0:
2880
2877
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2881
2878
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2882
- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2883
- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2879
+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
2884
2880
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2885
2881
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2886
2882
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3010,8 +3006,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
3010
3006
; AVX512BW: # %bb.0:
3011
3007
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3012
3008
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3013
- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3014
- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3009
+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
3015
3010
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
3016
3011
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
3017
3012
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3148,8 +3143,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
3148
3143
; AVX512BW: # %bb.0:
3149
3144
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3150
3145
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3151
- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3152
- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3146
+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
3153
3147
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3154
3148
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3155
3149
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3290,8 +3284,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
3290
3284
; AVX512BW: # %bb.0:
3291
3285
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3292
3286
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3293
- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3294
- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3287
+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
3295
3288
; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3296
3289
; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
3297
3290
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3407,8 +3400,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
3407
3400
; AVX512BW: # %bb.0:
3408
3401
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3409
3402
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3410
- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3411
- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3403
+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
3412
3404
; AVX512BW-NEXT: movw $1, %ax
3413
3405
; AVX512BW-NEXT: kmovd %eax, %k1
3414
3406
; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
@@ -4565,17 +4557,30 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
4565
4557
; AVX512DQ-NEXT: vzeroupper
4566
4558
; AVX512DQ-NEXT: retq
4567
4559
;
4568
- ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4569
- ; AVX512BW: # %bb.0:
4570
- ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4571
- ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4572
- ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4573
- ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
4574
- ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4575
- ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4576
- ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4577
- ; AVX512BW-NEXT: vzeroupper
4578
- ; AVX512BW-NEXT: retq
4560
+ ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4561
+ ; AVX512BW-SLOW: # %bb.0:
4562
+ ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4563
+ ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4564
+ ; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
4565
+ ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4566
+ ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
4567
+ ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4568
+ ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4569
+ ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4570
+ ; AVX512BW-SLOW-NEXT: vzeroupper
4571
+ ; AVX512BW-SLOW-NEXT: retq
4572
+ ;
4573
+ ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4574
+ ; AVX512BW-FAST: # %bb.0:
4575
+ ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4576
+ ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4577
+ ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4578
+ ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
4579
+ ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4580
+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4581
+ ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4582
+ ; AVX512BW-FAST-NEXT: vzeroupper
4583
+ ; AVX512BW-FAST-NEXT: retq
4579
4584
%in.vec.base = load <64 x i8 >, ptr %in.vec.base.ptr , align 64
4580
4585
%in.vec.bias = load <64 x i8 >, ptr %in.vec.bias.ptr , align 64
4581
4586
%in.vec = add <64 x i8 > %in.vec.base , %in.vec.bias
0 commit comments