Commit 5a6e163

rohitaggarwal007 (Rohit Aggarwal) authored and committed
[X86][DAGCombiner][SelectionDAG] - Fold Zext Build Vector to Bitcast of widen Build Vector (llvm#135010)
I am working on a problem in which a kernel is SLP-vectorized, leading to the generation of insertelement instructions followed by a zext. On lowering, the assembly looks like this:

    vmovd %r9d, %xmm0
    vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
    vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
    vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
    vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
    vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
    vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
    vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
    vpmovzxbw %xmm0, %xmm0 # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    vpmaddwd (%rdx), %xmm0, %xmm0

After all the vpinsrb instructions, xmm0 looks like

    xmm0 = xmm0[0],xmm0[1],xmm0[2],xmm0[3],xmm0[4],xmm0[5],xmm0[6],xmm0[7],zero,zero,zero,zero,zero,zero,zero,zero

Here vpmovzxbw performs the extension from i8 to i16, but it is an expensive operation and I want to remove it.

Optimization: place each value in its final location while inserting, so that the zext can be avoided. While lowering, we can write a custom LowerOperation for the zero_extend_vector_inreg opcode and override the current default handling in the legalization step. The proposed codegen is shown below:

    vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
    vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
    vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
    vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
    vpinsrb $10, (%rdi,%rax), %xmm0, %xmm0
    vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
    vpinsrb $14, (%rdi,%r8), %xmm0, %xmm0
    # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    vpmaddwd (%rdx), %xmm0, %xmm0

More details are in the Discourse topic: https://discourse.llvm.org/t/improve-the-gathering-of-the-elements-so-that-unwanted-ext-operations-can-be-avoided/85443

---------

Co-authored-by: Rohit Aggarwal <[email protected]>
1 parent b17007e · commit 5a6e163
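For illustration only (not part of this commit), here is a minimal LLVM IR sketch of the pattern the new widenBuildVec combine looks for: a zero_extend of a build_vector whose elements are all loads. The function and value names below are made up; the committed test llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll exercises the same shape inside dot-product kernels.

    define <4 x i32> @zext_bv_of_loads(ptr %a, i64 %stride) {
    entry:
      ; Strided i8 loads gathered into a <4 x i8> vector (the shape SLP emits).
      %p1 = getelementptr inbounds i8, ptr %a, i64 %stride
      %p2 = getelementptr inbounds i8, ptr %p1, i64 %stride
      %p3 = getelementptr inbounds i8, ptr %p2, i64 %stride
      %e0 = load i8, ptr %a, align 1
      %e1 = load i8, ptr %p1, align 1
      %e2 = load i8, ptr %p2, align 1
      %e3 = load i8, ptr %p3, align 1
      %v0 = insertelement <4 x i8> poison, i8 %e0, i64 0
      %v1 = insertelement <4 x i8> %v0, i8 %e1, i64 1
      %v2 = insertelement <4 x i8> %v1, i8 %e2, i64 2
      %v3 = insertelement <4 x i8> %v2, i8 %e3, i64 3
      ; The zext below is the operation this patch targets on X86.
      %ext = zext <4 x i8> %v3 to <4 x i32>
      ret <4 x i32> %ext
    }

With the fold, the loaded bytes are placed directly at byte indices 0, 4, 8 and 12 of a widened 128-bit build_vector, the remaining lanes are filled with zeros, and the zext becomes a bitcast; that is why the updated checks below use pinsrb $4, $8, $12 instead of pinsrb $1, $2, $3 followed by a pmovzx.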

File tree

2 files changed: +166 −141 lines


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 76 additions & 0 deletions
@@ -55934,6 +55934,79 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
                      NegN2);
 }
 
+// Try to widen the build vector and bitcast it to the type of the zext.
+// This is a special case for the 128-bit vector types. The intention is to
+// remove the zext and replace it with a bitcast to the wider type. While
+// lowering, the bitcast is removed and the extra computation due to the zext
+// is avoided. For example:
+//   zext v4i16 (v4i8 build_vector (x, y, z, w)) ->
+//     bitcast v4i16 (v8i8 build_vector (x, 0, y, 0, z, 0, w, 0))
+static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
+  if (Extend->getOpcode() != ISD::ZERO_EXTEND)
+    return SDValue();
+
+  EVT ExtendVT = Extend->getValueType(0);
+
+  SDValue BV = Extend->getOperand(0);
+  if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
+    return SDValue();
+
+  if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
+    // If the build vector has undef elements, we cannot widen it.
+    // The widening would create a vector with more undef elements, which
+    // is not valid.
+    return SDValue();
+  }
+
+  if (!all_of(BV->op_values(),
+              [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
+    // If the build vector has any element other than an ISD::LOAD, we cannot
+    // widen it.
+    return SDValue();
+  }
+
+  SDLoc dl(BV);
+  EVT VT = BV.getValueType();
+  EVT EltVT = BV.getOperand(0).getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+      TargetLowering::TypeWidenVector)
+    return SDValue();
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
+  assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
+  // Fill the new elements with zero.
+  NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
+  // Compute the step to place the elements in the right place and control the
+  // iteration.
+  unsigned step = WidenNumElts / NumElts;
+  if (WidenVT.is128BitVector()) {
+    if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
+      for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
+           i--, j -= step) {
+        SDValue temp = NewOps[i];
+        NewOps[i] = NewOps[j];
+        NewOps[j] = temp;
+      }
+      // Create a new build vector with WidenVT and NewOps.
+      SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
+      // Replace the old build vector with the new one. Bitcast the
+      // new build vector to the type of the zext.
+      SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
+      return NewBV;
+    }
+  }
+  return SDValue();
+}
+
 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
@@ -55993,6 +56066,9 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  if (SDValue V = widenBuildVec(N, DAG))
+    return V;
+
   return SDValue();
 }
 
llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll

Lines changed: 90 additions & 141 deletions
@@ -31,88 +31,62 @@ define i32 @dot_ext_v8i8_v8i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; SSE2-NEXT: pinsrw $6, %r9d, %xmm0
 ; SSE2-NEXT: pinsrw $7, %esi, %xmm0
 ; SSE2-NEXT: movdqu (%rdx), %xmm1
-; SSE2-NEXT: pmaddwd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: pmaddwd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
 ; SSE2-NEXT: popq %rbx
 ; SSE2-NEXT: popq %r14
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: dot_ext_v8i8_v8i32:
 ; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,4), %rax
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
-; SSE4-NEXT: leaq (,%rsi,8), %r8
-; SSE4-NEXT: movzbl (%rdi), %r9d
-; SSE4-NEXT: movd %r9d, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrb $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrb $3, (%rdi,%rcx), %xmm0
-; SSE4-NEXT: pinsrb $4, (%rdi,%rsi,4), %xmm0
-; SSE4-NEXT: pinsrb $5, (%rdi,%rax), %xmm0
-; SSE4-NEXT: pinsrb $6, (%rdi,%rcx,2), %xmm0
-; SSE4-NEXT: subq %rsi, %r8
-; SSE4-NEXT: pinsrb $7, (%rdi,%r8), %xmm0
+; SSE4-NEXT: movzbl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,4), %rcx
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %r8
+; SSE4-NEXT: leaq (,%rsi,8), %r9
+; SSE4-NEXT: subq %rsi, %r9
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrb $2, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrb $4, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrb $6, (%rdi,%r8), %xmm0
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,4), %xmm0
+; SSE4-NEXT: pinsrb $10, (%rdi,%rcx), %xmm0
+; SSE4-NEXT: pinsrb $12, (%rdi,%r8,2), %xmm0
+; SSE4-NEXT: pinsrb $14, (%rdi,%r9), %xmm0
 ; SSE4-NEXT: movdqu (%rdx), %xmm1
-; SSE4-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE4-NEXT: pmaddwd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE4-NEXT: paddd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT: pmaddwd %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE4-NEXT: paddd %xmm1, %xmm0
-; SSE4-NEXT: movd %xmm0, %eax
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT: paddd %xmm0, %xmm1
+; SSE4-NEXT: movd %xmm1, %eax
 ; SSE4-NEXT: retq
 ;
-; AVX2-LABEL: dot_ext_v8i8_v8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: leaq (%rsi,%rsi,4), %rax
-; AVX2-NEXT: leaq (%rsi,%rsi,2), %rcx
-; AVX2-NEXT: leaq (,%rsi,8), %r8
-; AVX2-NEXT: subq %rsi, %r8
-; AVX2-NEXT: movzbl (%rdi), %r9d
-; AVX2-NEXT: vmovd %r9d, %xmm0
-; AVX2-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: dot_ext_v8i8_v8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: leaq (%rsi,%rsi,4), %rax
-; AVX512-NEXT: leaq (%rsi,%rsi,2), %rcx
-; AVX512-NEXT: leaq (,%rsi,8), %r8
-; AVX512-NEXT: movzbl (%rdi), %r9d
-; AVX512-NEXT: vmovd %r9d, %xmm0
-; AVX512-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
-; AVX512-NEXT: subq %rsi, %r8
-; AVX512-NEXT: vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: dot_ext_v8i8_v8i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
+; AVX-NEXT: leaq (%rsi,%rsi,4), %r8
+; AVX-NEXT: leaq (,%rsi,8), %r9
+; AVX-NEXT: subq %rsi, %r9
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
 entry:
   %var0 = load i8, ptr %a, align 1
   %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
@@ -175,14 +149,13 @@ define i32 @dot_ext_v4i8_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ;
 ; SSE4-LABEL: dot_ext_v4i8_v4i32:
 ; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rax
-; SSE4-NEXT: movzbl (%rdi), %ecx
-; SSE4-NEXT: movd %ecx, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrb $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrb $3, (%rdi,%rax), %xmm0
+; SSE4-NEXT: movzbl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrb $4, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrb $12, (%rdi,%rcx), %xmm0
 ; SSE4-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE4-NEXT: pmaddwd %xmm0, %xmm1
 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE4-NEXT: paddd %xmm1, %xmm0
@@ -194,12 +167,11 @@ define i32 @dot_ext_v4i8_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; AVX-LABEL: dot_ext_v4i8_v4i32:
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
 ; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX-NEXT: vpinsrb $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0
 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -311,8 +283,7 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movzbl (%rdi), %eax
 ; SSE4-NEXT: movd %eax, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi), %xmm0
 ; SSE4-NEXT: pmovsxbq (%rdx), %xmm1
 ; SSE4-NEXT: pmuldq %xmm0, %xmm1
 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
@@ -324,8 +295,7 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; AVX: # %bb.0:
 ; AVX-NEXT: movzbl (%rdi), %eax
 ; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vpmovsxbq (%rdx), %xmm1
 ; AVX-NEXT: vpmuldq %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -374,14 +344,13 @@ define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ;
 ; SSE4-LABEL: dot_ext_v4i16_v4i32:
 ; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rax
-; SSE4-NEXT: movzwl (%rdi), %ecx
-; SSE4-NEXT: movd %ecx, %xmm0
-; SSE4-NEXT: pinsrw $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrw $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrw $3, (%rdi,%rax), %xmm0
+; SSE4-NEXT: movzwl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrw $4, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrw $6, (%rdi,%rcx), %xmm0
 ; SSE4-NEXT: pmovsxwd (%rdx), %xmm1
-; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE4-NEXT: pmulld %xmm0, %xmm1
 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE4-NEXT: paddd %xmm1, %xmm0
@@ -390,41 +359,22 @@ define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; SSE4-NEXT: movd %xmm1, %eax
 ; SSE4-NEXT: retq
 ;
-; AVX2-LABEL: dot_ext_v4i16_v4i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX2-NEXT: movzwl (%rdi), %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxwd (%rdx), %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: dot_ext_v4i16_v4i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX512-NEXT: movzwl (%rdi), %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpmovsxwd (%rdx), %xmm1
-; AVX512-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: dot_ext_v4i16_v4i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpmovsxwd (%rdx), %xmm1
+; AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
 entry:
   %var0 = load i16, ptr %a, align 1
   %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
@@ -509,16 +459,15 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: psllq $32, %xmm2
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: paddq %xmm2, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddq %xmm1, %xmm0
 ; SSE2-NEXT: movq %xmm0, %rax
@@ -560,8 +509,8 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
 ; AVX512-LABEL: dot_ext_v2i32_v2i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpinsrd $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX512-NEXT: vpmovsxdq (%rdx), %xmm1
 ; AVX512-NEXT: vpmullq %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
