
Commit 37b79e7

RKSimon authored and tru committed
[X86] combineConcatVectorOps - only concatenate single-use subops
We could maybe extend this by allowing the lowest subop to have multiple uses and extract the lowest subvector result of the concatenated op, but let's just get the fix in first. Fixes #67333
1 parent 5a13ce2 commit 37b79e7

6 files changed: 358 additions and 164 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
@@ -57239,7 +57239,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
   // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
   // but it currently struggles with different vector widths.
   if (llvm::all_of(Ops, [Op0](SDValue Op) {
-        return Op.getOpcode() == Op0.getOpcode();
+        return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
       })) {
     auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
       SmallVector<SDValue> Subs;
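Note: the one-line guard above is the whole functional change. As a rough, self-contained sketch of the idea (toy types and hypothetical names such as ToyOp and canConcatSubOps; the real code operates on SDValue nodes inside combineConcatVectorOps), the fold is only attempted when every sub-operand both matches the first op's opcode and has a single use, since a multi-use subop must stay live for its other users and folding it into a wider concatenated op is then not clearly safe or profitable:

#include <algorithm>
#include <vector>

// Toy stand-in for an SDValue-like node; hasOneUse mirrors the new check.
struct ToyOp {
  unsigned Opcode = 0;
  unsigned NumUses = 0;
  bool hasOneUse() const { return NumUses == 1; }
};

// Hypothetical helper mirroring the patched llvm::all_of predicate: only
// concatenate when all subops share Op0's opcode *and* are single-use.
static bool canConcatSubOps(const std::vector<ToyOp> &Ops) {
  if (Ops.empty())
    return false;
  const ToyOp &Op0 = Ops.front();
  return std::all_of(Ops.begin(), Ops.end(), [&Op0](const ToyOp &Op) {
    return Op.Opcode == Op0.Opcode && Op.hasOneUse();
  });
}

The new pr67333.ll test below is the reduced reproducer from #67333; its repeatedly reused <2 x i32> values are the kind of multi-use subops this check now rejects.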

llvm/test/CodeGen/X86/pr67333.ll

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s

declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0

define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
; CHECK-LABEL: SHA256_Compress_Generic:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movbel 0, %eax
; CHECK-NEXT: movbel 12(%rdi), %ecx
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128]
; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vpsrld $17, %xmm2, %xmm0
; CHECK-NEXT: vpslld $15, %xmm2, %xmm3
; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0
; CHECK-NEXT: vpsrld $19, %xmm2, %xmm3
; CHECK-NEXT: vpslld $13, %xmm2, %xmm4
; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vmovd %ecx, %xmm3
; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; CHECK-NEXT: vpsrld $17, %xmm1, %xmm0
; CHECK-NEXT: vpslld $15, %xmm1, %xmm3
; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0
; CHECK-NEXT: vpsrld $19, %xmm1, %xmm3
; CHECK-NEXT: vpslld $13, %xmm1, %xmm4
; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3
; CHECK-NEXT: vpslld $15, %xmm0, %xmm4
; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4
; CHECK-NEXT: vpslld $13, %xmm0, %xmm5
; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT: vpsrld $10, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm0, %xmm3, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3
; CHECK-NEXT: vpslld $15, %xmm0, %xmm4
; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4
; CHECK-NEXT: vpslld $13, %xmm0, %xmm5
; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT: vpsrld $10, %xmm0, %xmm4
; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3]
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3]
; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vpsrld $17, %xmm2, %xmm3
; CHECK-NEXT: vpslld $15, %xmm2, %xmm4
; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT: vpsrld $19, %xmm2, %xmm4
; CHECK-NEXT: vpslld $13, %xmm2, %xmm5
; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT: vpsrld $10, %xmm2, %xmm2
; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm2
; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm3
; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpsrld $17, %xmm1, %xmm2
; CHECK-NEXT: vpslld $15, %xmm1, %xmm4
; CHECK-NEXT: vpor %xmm2, %xmm4, %xmm2
; CHECK-NEXT: vpsrld $19, %xmm1, %xmm4
; CHECK-NEXT: vpslld $13, %xmm1, %xmm5
; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT: vpxor %xmm4, %xmm2, %xmm2
; CHECK-NEXT: vpsrld $10, %xmm1, %xmm4
; CHECK-NEXT: vpxor %xmm4, %xmm2, %xmm2
; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; CHECK-NEXT: vpsrld $17, %xmm2, %xmm3
; CHECK-NEXT: vpslld $15, %xmm2, %xmm4
; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT: vpsrld $19, %xmm2, %xmm4
; CHECK-NEXT: vpslld $13, %xmm2, %xmm5
; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT: vpsrld $10, %xmm2, %xmm2
; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm2
; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpsrld $17, %xmm0, %xmm2
; CHECK-NEXT: vpslld $15, %xmm0, %xmm3
; CHECK-NEXT: vpor %xmm2, %xmm3, %xmm2
; CHECK-NEXT: vpsrld $19, %xmm0, %xmm3
; CHECK-NEXT: vpslld $13, %xmm0, %xmm4
; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vpsrld $10, %xmm0, %xmm3
; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vpsllq $32, %xmm1, %xmm3
; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: vmovdqu %ymm0, 132(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
  %0 = load i32, ptr null, align 4
  %1 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #3
  %arrayidx14 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 3
  %2 = load i32, ptr %arrayidx14, align 4
  %3 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %2) #3
  %4 = insertelement <2 x i32> zeroinitializer, i32 %1, i64 1
  %5 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 15, i32 15>)
  %6 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 13, i32 13>)
  %7 = xor <2 x i32> %5, %6
  %8 = lshr <2 x i32> %4, zeroinitializer
  %9 = xor <2 x i32> %7, %8
  %10 = insertelement <2 x i32> zeroinitializer, i32 %3, i64 0
  %11 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %10, <2 x i32> <i32 1, i32 2>
  %12 = add <2 x i32> %11, %9
  %13 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 15, i32 15>)
  %14 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 13, i32 13>)
  %15 = xor <2 x i32> %13, %14
  %16 = lshr <2 x i32> %12, zeroinitializer
  %17 = xor <2 x i32> %15, %16
  %18 = add <2 x i32> %4, %17
  %19 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 15, i32 15>)
  %20 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 13, i32 13>)
  %21 = xor <2 x i32> %19, %20
  %22 = lshr <2 x i32> %18, <i32 10, i32 10>
  %23 = xor <2 x i32> %21, %22
  %24 = add <2 x i32> %4, %23
  %25 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 15, i32 15>)
  %26 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 13, i32 13>)
  %27 = xor <2 x i32> %25, %26
  %28 = lshr <2 x i32> %24, <i32 10, i32 10>
  %29 = xor <2 x i32> %27, %28
  %30 = shufflevector <2 x i32> %4, <2 x i32> %12, <2 x i32> <i32 1, i32 2>
  %31 = add <2 x i32> %30, %29
  %32 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 15, i32 15>)
  %33 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 13, i32 13>)
  %34 = xor <2 x i32> %32, %33
  %35 = lshr <2 x i32> %31, <i32 10, i32 10>
  %36 = xor <2 x i32> %34, %35
  %37 = shufflevector <2 x i32> %12, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
  %38 = add <2 x i32> %37, %36
  %arrayidx918 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 33
  store <2 x i32> %38, ptr %arrayidx918, align 4
  %arrayidx1012 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 35
  %39 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 15, i32 15>)
  %40 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 13, i32 13>)
  %41 = xor <2 x i32> %39, %40
  %42 = lshr <2 x i32> %38, <i32 10, i32 10>
  %43 = xor <2 x i32> %41, %42
  %44 = add <2 x i32> %37, %43
  store <2 x i32> zeroinitializer, ptr %arrayidx1012, align 4
  %arrayidx1106 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 37
  %45 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 15, i32 15>)
  %46 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 13, i32 13>)
  %47 = xor <2 x i32> %45, %46
  %48 = lshr <2 x i32> %44, <i32 10, i32 10>
  %49 = xor <2 x i32> %47, %48
  %50 = lshr <2 x i32> %24, zeroinitializer
  %51 = add <2 x i32> %50, %49
  store <2 x i32> %51, ptr %arrayidx1106, align 4
  %arrayidx1200 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 39
  %52 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 15, i32 15>)
  %53 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 13, i32 13>)
  %54 = xor <2 x i32> %52, %53
  %55 = lshr <2 x i32> %51, <i32 10, i32 10>
  %56 = xor <2 x i32> %54, %55
  %57 = shufflevector <2 x i32> %38, <2 x i32> zeroinitializer, <2 x i32> <i32 poison, i32 0>
  %58 = insertelement <2 x i32> %57, i32 0, i64 0
  %59 = add <2 x i32> %58, %56
  store <2 x i32> %59, ptr %arrayidx1200, align 4
  ret void

  ; uselistorder directives
  uselistorder <2 x i32> %4, { 7, 0, 1, 6, 5, 4, 3, 2 }
  uselistorder <2 x i32> %38, { 6, 5, 4, 3, 2, 1, 0 }
}

declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #2

; uselistorder directives
uselistorder ptr @llvm.fshl.v2i32, { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }

attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { nounwind sspstrong memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "probe-stack"="inline-asm" "stack-protector-buffer-size"="8" "target-cpu"="skylake" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #3 = { nounwind memory(none) }

llvm/test/CodeGen/X86/subvector-broadcast.ll

Lines changed: 4 additions & 4 deletions
@@ -1768,8 +1768,8 @@ define void @PR51226() {
 ; X86-AVX2-LABEL: PR51226:
 ; X86-AVX2: # %bb.0:
 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX2-NEXT: vpslld $16, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpslld $16, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT: vminps %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vmovups %ymm0, (%eax)
@@ -1779,8 +1779,8 @@ define void @PR51226() {
 ; X86-AVX512-LABEL: PR51226:
 ; X86-AVX512: # %bb.0:
 ; X86-AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX512-NEXT: vpslld $16, %xmm0, %xmm0
 ; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX512-NEXT: vpslld $16, %ymm0, %ymm0
 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0
 ; X86-AVX512-NEXT: vmovups %ymm0, (%eax)
@@ -1801,8 +1801,8 @@ define void @PR51226() {
 ; X64-AVX2-LABEL: PR51226:
 ; X64-AVX2: # %bb.0:
 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX2-NEXT: vpslld $16, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpslld $16, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX2-NEXT: vminps %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vmovups %ymm0, (%rax)
@@ -1812,8 +1812,8 @@ define void @PR51226() {
 ; X64-AVX512-LABEL: PR51226:
 ; X64-AVX512: # %bb.0:
 ; X64-AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX512-NEXT: vpslld $16, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpslld $16, %ymm0, %ymm0
 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT: vmovups %ymm0, (%rax)

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll

Lines changed: 9 additions & 7 deletions
@@ -726,17 +726,19 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1
 ; AVX512BW-SLOW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1
 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
