Skip to content

Commit 09c8f38

Browse files
committed
[X86] Add isel patterns for X86VBroadcast with i16 truncates from i16->i64 zextload/extload.
We can form vpbroadcastw with a folded load. We had patterns for i16->i32 zextload/extload, but nothing prevents i64 from occurring. I'd like to move this all to DAG combine to fix more cases, but this is a trivial fix to minimize test diffs when moving to a combine.
1 parent 51a4c61 commit 09c8f38

File tree

5 files changed

+79
-115
lines changed

5 files changed

+79
-115
lines changed

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1427,26 +1427,46 @@ let Predicates = [HasVLX, HasBWI] in {
14271427
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
14281428
// This means we'll encounter truncated i32 loads; match that here.
14291429
def : Pat<(v8i16 (X86VBroadcast
1430-
(i16 (trunc (i32 (extloadi16 addr:$src)))))),
1430+
(i16 (trunc (extloadi32i16 addr:$src))))),
14311431
(VPBROADCASTWZ128rm addr:$src)>;
14321432
def : Pat<(v8i16 (X86VBroadcast
1433-
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
1433+
(i16 (trunc (zextloadi32i16 addr:$src))))),
14341434
(VPBROADCASTWZ128rm addr:$src)>;
14351435
def : Pat<(v16i16 (X86VBroadcast
1436-
(i16 (trunc (i32 (extloadi16 addr:$src)))))),
1436+
(i16 (trunc (extloadi32i16 addr:$src))))),
14371437
(VPBROADCASTWZ256rm addr:$src)>;
14381438
def : Pat<(v16i16 (X86VBroadcast
1439-
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
1439+
(i16 (trunc (zextloadi32i16 addr:$src))))),
1440+
(VPBROADCASTWZ256rm addr:$src)>;
1441+
1442+
def : Pat<(v8i16 (X86VBroadcast
1443+
(i16 (trunc (extloadi64i16 addr:$src))))),
1444+
(VPBROADCASTWZ128rm addr:$src)>;
1445+
def : Pat<(v8i16 (X86VBroadcast
1446+
(i16 (trunc (zextloadi64i16 addr:$src))))),
1447+
(VPBROADCASTWZ128rm addr:$src)>;
1448+
def : Pat<(v16i16 (X86VBroadcast
1449+
(i16 (trunc (extloadi64i16 addr:$src))))),
1450+
(VPBROADCASTWZ256rm addr:$src)>;
1451+
def : Pat<(v16i16 (X86VBroadcast
1452+
(i16 (trunc (zextloadi64i16 addr:$src))))),
14401453
(VPBROADCASTWZ256rm addr:$src)>;
14411454
}
14421455
let Predicates = [HasBWI] in {
14431456
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
14441457
// This means we'll encounter truncated i32 loads; match that here.
14451458
def : Pat<(v32i16 (X86VBroadcast
1446-
(i16 (trunc (i32 (extloadi16 addr:$src)))))),
1459+
(i16 (trunc (extloadi32i16 addr:$src))))),
1460+
(VPBROADCASTWZrm addr:$src)>;
1461+
def : Pat<(v32i16 (X86VBroadcast
1462+
(i16 (trunc (zextloadi32i16 addr:$src))))),
1463+
(VPBROADCASTWZrm addr:$src)>;
1464+
1465+
def : Pat<(v32i16 (X86VBroadcast
1466+
(i16 (trunc (extloadi64i16 addr:$src))))),
14471467
(VPBROADCASTWZrm addr:$src)>;
14481468
def : Pat<(v32i16 (X86VBroadcast
1449-
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
1469+
(i16 (trunc (zextloadi64i16 addr:$src))))),
14501470
(VPBROADCASTWZrm addr:$src)>;
14511471
}
14521472

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7518,16 +7518,29 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
75187518
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
75197519
// This means we'll encounter truncated i32 loads; match that here.
75207520
def : Pat<(v8i16 (X86VBroadcast
7521-
(i16 (trunc (i32 (extloadi16 addr:$src)))))),
7521+
(i16 (trunc (extloadi32i16 addr:$src))))),
75227522
(VPBROADCASTWrm addr:$src)>;
75237523
def : Pat<(v8i16 (X86VBroadcast
7524-
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7524+
(i16 (trunc (zextloadi32i16 addr:$src))))),
75257525
(VPBROADCASTWrm addr:$src)>;
75267526
def : Pat<(v16i16 (X86VBroadcast
7527-
(i16 (trunc (i32 (extloadi16 addr:$src)))))),
7527+
(i16 (trunc (extloadi32i16 addr:$src))))),
75287528
(VPBROADCASTWYrm addr:$src)>;
75297529
def : Pat<(v16i16 (X86VBroadcast
7530-
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7530+
(i16 (trunc (zextloadi32i16 addr:$src))))),
7531+
(VPBROADCASTWYrm addr:$src)>;
7532+
7533+
def : Pat<(v8i16 (X86VBroadcast
7534+
(i16 (trunc (extloadi64i16 addr:$src))))),
7535+
(VPBROADCASTWrm addr:$src)>;
7536+
def : Pat<(v8i16 (X86VBroadcast
7537+
(i16 (trunc (zextloadi64i16 addr:$src))))),
7538+
(VPBROADCASTWrm addr:$src)>;
7539+
def : Pat<(v16i16 (X86VBroadcast
7540+
(i16 (trunc (extloadi64i16 addr:$src))))),
7541+
(VPBROADCASTWYrm addr:$src)>;
7542+
def : Pat<(v16i16 (X86VBroadcast
7543+
(i16 (trunc (zextloadi64i16 addr:$src))))),
75317544
(VPBROADCASTWYrm addr:$src)>;
75327545
}
75337546

llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll

Lines changed: 15 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3331,18 +3331,10 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(i64* %ptr) {
33313331
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
33323332
; AVX1-NEXT: retq
33333333
;
3334-
; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
3335-
; AVX2: # %bb.0:
3336-
; AVX2-NEXT: movzwl 6(%rdi), %eax
3337-
; AVX2-NEXT: vmovd %eax, %xmm0
3338-
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3339-
; AVX2-NEXT: retq
3340-
;
3341-
; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_i64:
3342-
; AVX512VL: # %bb.0:
3343-
; AVX512VL-NEXT: movzwl 6(%rdi), %eax
3344-
; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
3345-
; AVX512VL-NEXT: retq
3334+
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i64:
3335+
; AVX2OR512VL: # %bb.0:
3336+
; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %xmm0
3337+
; AVX2OR512VL-NEXT: retq
33463338
;
33473339
; XOPAVX1-LABEL: insert_dup_elt3_mem_v8i16_i64:
33483340
; XOPAVX1: # %bb.0:
@@ -3353,9 +3345,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(i64* %ptr) {
33533345
;
33543346
; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
33553347
; XOPAVX2: # %bb.0:
3356-
; XOPAVX2-NEXT: movzwl 6(%rdi), %eax
3357-
; XOPAVX2-NEXT: vmovd %eax, %xmm0
3358-
; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3348+
; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %xmm0
33593349
; XOPAVX2-NEXT: retq
33603350
%tmp = load i64, i64* %ptr, align 4
33613351
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
@@ -3392,18 +3382,10 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(i64* %ptr) {
33923382
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
33933383
; AVX1-NEXT: retq
33943384
;
3395-
; AVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
3396-
; AVX2: # %bb.0:
3397-
; AVX2-NEXT: movzwl 6(%rdi), %eax
3398-
; AVX2-NEXT: vmovd %eax, %xmm0
3399-
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3400-
; AVX2-NEXT: retq
3401-
;
3402-
; AVX512VL-LABEL: insert_dup_elt7_mem_v8i16_i64:
3403-
; AVX512VL: # %bb.0:
3404-
; AVX512VL-NEXT: movzwl 6(%rdi), %eax
3405-
; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
3406-
; AVX512VL-NEXT: retq
3385+
; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v8i16_i64:
3386+
; AVX2OR512VL: # %bb.0:
3387+
; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %xmm0
3388+
; AVX2OR512VL-NEXT: retq
34073389
;
34083390
; XOPAVX1-LABEL: insert_dup_elt7_mem_v8i16_i64:
34093391
; XOPAVX1: # %bb.0:
@@ -3414,9 +3396,7 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(i64* %ptr) {
34143396
;
34153397
; XOPAVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
34163398
; XOPAVX2: # %bb.0:
3417-
; XOPAVX2-NEXT: movzwl 6(%rdi), %eax
3418-
; XOPAVX2-NEXT: vmovd %eax, %xmm0
3419-
; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3399+
; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %xmm0
34203400
; XOPAVX2-NEXT: retq
34213401
%tmp = load i64, i64* %ptr, align 4
34223402
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
@@ -3442,18 +3422,10 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
34423422
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
34433423
; AVX1-NEXT: retq
34443424
;
3445-
; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
3446-
; AVX2: # %bb.0:
3447-
; AVX2-NEXT: movzwl (%rdi), %eax
3448-
; AVX2-NEXT: vmovd %eax, %xmm0
3449-
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3450-
; AVX2-NEXT: retq
3451-
;
3452-
; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
3453-
; AVX512VL: # %bb.0:
3454-
; AVX512VL-NEXT: movzwl (%rdi), %eax
3455-
; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
3456-
; AVX512VL-NEXT: retq
3425+
; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
3426+
; AVX2OR512VL: # %bb.0:
3427+
; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %xmm0
3428+
; AVX2OR512VL-NEXT: retq
34573429
;
34583430
; XOPAVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
34593431
; XOPAVX1: # %bb.0:
@@ -3465,9 +3437,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
34653437
;
34663438
; XOPAVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
34673439
; XOPAVX2: # %bb.0:
3468-
; XOPAVX2-NEXT: movzwl (%rdi), %eax
3469-
; XOPAVX2-NEXT: vmovd %eax, %xmm0
3470-
; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3440+
; XOPAVX2-NEXT: vpbroadcastw (%rdi), %xmm0
34713441
; XOPAVX2-NEXT: retq
34723442
%tmp = load i16, i16* %ptr, align 2
34733443
%tmp1 = sext i16 %tmp to i64

llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll

Lines changed: 15 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -7546,18 +7546,10 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
75467546
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
75477547
; AVX1-NEXT: retq
75487548
;
7549-
; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
7550-
; AVX2: # %bb.0:
7551-
; AVX2-NEXT: movzwl 6(%rdi), %eax
7552-
; AVX2-NEXT: vmovd %eax, %xmm0
7553-
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
7554-
; AVX2-NEXT: retq
7555-
;
7556-
; AVX512VL-LABEL: insert_dup_elt3_mem_v16i16_i64:
7557-
; AVX512VL: # %bb.0:
7558-
; AVX512VL-NEXT: movzwl 6(%rdi), %eax
7559-
; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
7560-
; AVX512VL-NEXT: retq
7549+
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i64:
7550+
; AVX2OR512VL: # %bb.0:
7551+
; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %ymm0
7552+
; AVX2OR512VL-NEXT: retq
75617553
;
75627554
; XOPAVX1-LABEL: insert_dup_elt3_mem_v16i16_i64:
75637555
; XOPAVX1: # %bb.0:
@@ -7569,9 +7561,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
75697561
;
75707562
; XOPAVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
75717563
; XOPAVX2: # %bb.0:
7572-
; XOPAVX2-NEXT: movzwl 6(%rdi), %eax
7573-
; XOPAVX2-NEXT: vmovd %eax, %xmm0
7574-
; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
7564+
; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %ymm0
75757565
; XOPAVX2-NEXT: retq
75767566
%tmp = load i64, i64* %ptr, align 4
75777567
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
@@ -7588,18 +7578,10 @@ define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
75887578
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
75897579
; AVX1-NEXT: retq
75907580
;
7591-
; AVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
7592-
; AVX2: # %bb.0:
7593-
; AVX2-NEXT: movzwl 6(%rdi), %eax
7594-
; AVX2-NEXT: vmovd %eax, %xmm0
7595-
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
7596-
; AVX2-NEXT: retq
7597-
;
7598-
; AVX512VL-LABEL: insert_dup_elt7_mem_v16i16_i64:
7599-
; AVX512VL: # %bb.0:
7600-
; AVX512VL-NEXT: movzwl 6(%rdi), %eax
7601-
; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
7602-
; AVX512VL-NEXT: retq
7581+
; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v16i16_i64:
7582+
; AVX2OR512VL: # %bb.0:
7583+
; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %ymm0
7584+
; AVX2OR512VL-NEXT: retq
76037585
;
76047586
; XOPAVX1-LABEL: insert_dup_elt7_mem_v16i16_i64:
76057587
; XOPAVX1: # %bb.0:
@@ -7610,9 +7592,7 @@ define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
76107592
;
76117593
; XOPAVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
76127594
; XOPAVX2: # %bb.0:
7613-
; XOPAVX2-NEXT: movzwl 6(%rdi), %eax
7614-
; XOPAVX2-NEXT: vmovd %eax, %xmm0
7615-
; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
7595+
; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %ymm0
76167596
; XOPAVX2-NEXT: retq
76177597
%tmp = load i64, i64* %ptr, align 4
76187598
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
@@ -7631,18 +7611,10 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
76317611
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
76327612
; AVX1-NEXT: retq
76337613
;
7634-
; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
7635-
; AVX2: # %bb.0:
7636-
; AVX2-NEXT: movzwl (%rdi), %eax
7637-
; AVX2-NEXT: vmovd %eax, %xmm0
7638-
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
7639-
; AVX2-NEXT: retq
7640-
;
7641-
; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
7642-
; AVX512VL: # %bb.0:
7643-
; AVX512VL-NEXT: movzwl (%rdi), %eax
7644-
; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
7645-
; AVX512VL-NEXT: retq
7614+
; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
7615+
; AVX2OR512VL: # %bb.0:
7616+
; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0
7617+
; AVX2OR512VL-NEXT: retq
76467618
;
76477619
; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
76487620
; XOPAVX1: # %bb.0:
@@ -7655,9 +7627,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
76557627
;
76567628
; XOPAVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
76577629
; XOPAVX2: # %bb.0:
7658-
; XOPAVX2-NEXT: movzwl (%rdi), %eax
7659-
; XOPAVX2-NEXT: vmovd %eax, %xmm0
7660-
; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
7630+
; XOPAVX2-NEXT: vpbroadcastw (%rdi), %ymm0
76617631
; XOPAVX2-NEXT: retq
76627632
%tmp = load i16, i16* %ptr, align 2
76637633
%tmp1 = sext i16 %tmp to i64

llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -333,16 +333,13 @@ define <32 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
333333
define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
334334
; KNL-LABEL: insert_dup_elt3_mem_v16i16_i64:
335335
; KNL: ## %bb.0:
336-
; KNL-NEXT: movzwl 6(%rdi), %eax
337-
; KNL-NEXT: vmovd %eax, %xmm0
338-
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
336+
; KNL-NEXT: vpbroadcastw 6(%rdi), %ymm0
339337
; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
340338
; KNL-NEXT: retq
341339
;
342340
; SKX-LABEL: insert_dup_elt3_mem_v16i16_i64:
343341
; SKX: ## %bb.0:
344-
; SKX-NEXT: movzwl 6(%rdi), %eax
345-
; SKX-NEXT: vpbroadcastw %eax, %zmm0
342+
; SKX-NEXT: vpbroadcastw 6(%rdi), %zmm0
346343
; SKX-NEXT: retq
347344
%tmp = load i64, i64* %ptr, align 4
348345
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
@@ -354,16 +351,13 @@ define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
354351
define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
355352
; KNL-LABEL: insert_dup_elt7_mem_v16i16_i64:
356353
; KNL: ## %bb.0:
357-
; KNL-NEXT: movzwl 6(%rdi), %eax
358-
; KNL-NEXT: vmovd %eax, %xmm0
359-
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
354+
; KNL-NEXT: vpbroadcastw 6(%rdi), %ymm0
360355
; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
361356
; KNL-NEXT: retq
362357
;
363358
; SKX-LABEL: insert_dup_elt7_mem_v16i16_i64:
364359
; SKX: ## %bb.0:
365-
; SKX-NEXT: movzwl 6(%rdi), %eax
366-
; SKX-NEXT: vpbroadcastw %eax, %zmm0
360+
; SKX-NEXT: vpbroadcastw 6(%rdi), %zmm0
367361
; SKX-NEXT: retq
368362
%tmp = load i64, i64* %ptr, align 4
369363
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
@@ -375,16 +369,13 @@ define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
375369
define <32 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
376370
; KNL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
377371
; KNL: ## %bb.0:
378-
; KNL-NEXT: movzwl (%rdi), %eax
379-
; KNL-NEXT: vmovd %eax, %xmm0
380-
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
372+
; KNL-NEXT: vpbroadcastw (%rdi), %ymm0
381373
; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
382374
; KNL-NEXT: retq
383375
;
384376
; SKX-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
385377
; SKX: ## %bb.0:
386-
; SKX-NEXT: movzwl (%rdi), %eax
387-
; SKX-NEXT: vpbroadcastw %eax, %zmm0
378+
; SKX-NEXT: vpbroadcastw (%rdi), %zmm0
388379
; SKX-NEXT: retq
389380
%tmp = load i16, i16* %ptr, align 2
390381
%tmp1 = sext i16 %tmp to i64

0 commit comments

Comments
 (0)