Skip to content

Commit 61e556d

Browse files
committed
Recommit r358887 "[TargetLowering][AMDGPU][X86] Improve SimplifyDemandedBits bitcast handling"
I've included a new fix in X86RegisterInfo to prevent PR41619 without reintroducing r359392. We might be able to improve that in the base class implementation of shouldRewriteCopySrc somehow. But this hopefully enables forward progress on SimplifyDemandedBits improvements for now. Original commit message: This patch adds support for BigBitWidth -> SmallBitWidth bitcasts, splitting the DemandedBits/Elts accordingly. The AMDGPU backend needed an extra (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1) combine to encourage BFE creation, I investigated putting this in DAGCombine but it caused a lot of noise on other targets - some improvements, some regressions. The X86 changes are all definite wins. llvm-svn: 360552
1 parent 3e6d690 commit 61e556d

File tree

9 files changed

+107
-139
lines changed

9 files changed

+107
-139
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1577,6 +1577,30 @@ bool TargetLowering::SimplifyDemandedBits(
15771577
KnownSrcZero, TLO, Depth + 1))
15781578
return true;
15791579

1580+
KnownBits KnownSrcBits;
1581+
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
1582+
KnownSrcBits, TLO, Depth + 1))
1583+
return true;
1584+
} else if ((NumSrcEltBits % BitWidth) == 0 &&
1585+
TLO.DAG.getDataLayout().isLittleEndian()) {
1586+
unsigned Scale = NumSrcEltBits / BitWidth;
1587+
unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
1588+
APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
1589+
APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
1590+
for (unsigned i = 0; i != NumElts; ++i)
1591+
if (DemandedElts[i]) {
1592+
unsigned Offset = (i % Scale) * BitWidth;
1593+
DemandedSrcBits.insertBits(DemandedBits, Offset);
1594+
DemandedSrcElts.setBit(i / Scale);
1595+
}
1596+
1597+
if (SrcVT.isVector()) {
1598+
APInt KnownSrcUndef, KnownSrcZero;
1599+
if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
1600+
KnownSrcZero, TLO, Depth + 1))
1601+
return true;
1602+
}
1603+
15801604
KnownBits KnownSrcBits;
15811605
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
15821606
KnownSrcBits, TLO, Depth + 1))
@@ -1586,7 +1610,7 @@ bool TargetLowering::SimplifyDemandedBits(
15861610
// If this is a bitcast, let computeKnownBits handle it. Only do this on a
15871611
// recursive call where Known may be useful to the caller.
15881612
if (Depth > 0) {
1589-
Known = TLO.DAG.computeKnownBits(Op, Depth);
1613+
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
15901614
return false;
15911615
}
15921616
break;

llvm/lib/Target/X86/X86RegisterInfo.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,21 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
216216
}
217217
}
218218

219+
bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
220+
unsigned DefSubReg,
221+
const TargetRegisterClass *SrcRC,
222+
unsigned SrcSubReg) const {
223+
// Prevent rewriting a copy where the destination size is larger than the
224+
// input size. See PR41619.
225+
// FIXME: Should this be factored into the base implementation somehow.
226+
if (DefRC->hasSuperClassEq(&X86::GR64RegClass) && DefSubReg == 0 &&
227+
SrcRC->hasSuperClassEq(&X86::GR64RegClass) && SrcSubReg == X86::sub_32bit)
228+
return false;
229+
230+
return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
231+
SrcRC, SrcSubReg);
232+
}
233+
219234
const TargetRegisterClass *
220235
X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
221236
const Function &F = MF.getFunction();

llvm/lib/Target/X86/X86RegisterInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ class X86RegisterInfo final : public X86GenRegisterInfo {
7474
getLargestLegalSuperClass(const TargetRegisterClass *RC,
7575
const MachineFunction &MF) const override;
7676

77+
bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
78+
unsigned DefSubReg,
79+
const TargetRegisterClass *SrcRC,
80+
unsigned SrcSubReg) const override;
81+
7782
/// getPointerRegClass - Returns a TargetRegisterClass used for pointer
7883
/// values.
7984
const TargetRegisterClass *

llvm/test/CodeGen/X86/bitcast-setcc-256.ll

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -448,22 +448,6 @@ define void @bitcast_8i32_store(i8* %p, <8 x i32> %a0) {
448448
define void @bitcast_4i64_store(i4* %p, <4 x i64> %a0) {
449449
; SSE2-SSSE3-LABEL: bitcast_4i64_store:
450450
; SSE2-SSSE3: # %bb.0:
451-
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
452-
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
453-
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
454-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
455-
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
456-
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
457-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
458-
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
459-
; SSE2-SSSE3-NEXT: por %xmm4, %xmm1
460-
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
461-
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
462-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
463-
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
464-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
465-
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
466-
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
467451
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
468452
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
469453
; SSE2-SSSE3-NEXT: movb %al, (%rdi)

llvm/test/CodeGen/X86/bitcast-setcc-512.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -609,15 +609,13 @@ define void @bitcast_8i64_store(i8* %p, <8 x i64> %a0) {
609609
;
610610
; AVX1-LABEL: bitcast_8i64_store:
611611
; AVX1: # %bb.0:
612-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
613-
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
614-
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
615-
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
616-
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
617612
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
613+
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
618614
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
619615
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
620616
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
617+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
618+
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
621619
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
622620
; AVX1-NEXT: vmovmskps %ymm0, %eax
623621
; AVX1-NEXT: movb %al, (%rdi)

llvm/test/CodeGen/X86/bitcast-vector-bool.ll

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -208,22 +208,6 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
208208
define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
209209
; SSE2-SSSE3-LABEL: bitcast_v4i64_to_v2i2:
210210
; SSE2-SSSE3: # %bb.0:
211-
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
212-
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
213-
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
214-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
215-
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
216-
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
217-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
218-
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
219-
; SSE2-SSSE3-NEXT: por %xmm4, %xmm1
220-
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
221-
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
222-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
223-
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
224-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
225-
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
226-
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
227211
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
228212
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
229213
; SSE2-SSSE3-NEXT: movl %eax, %ecx
@@ -532,15 +516,13 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
532516
;
533517
; AVX1-LABEL: bitcast_v8i64_to_v2i4:
534518
; AVX1: # %bb.0:
535-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
536-
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
537-
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
538-
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
539-
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
540519
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
520+
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
541521
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
542522
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
543523
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
524+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
525+
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
544526
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
545527
; AVX1-NEXT: vmovmskps %ymm0, %eax
546528
; AVX1-NEXT: movl %eax, %ecx

llvm/test/CodeGen/X86/dagcombine-cse.ll

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,11 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n
1414
;
1515
; X64-LABEL: t:
1616
; X64: ## %bb.0: ## %entry
17-
; X64-NEXT: ## kill: def $edx killed $edx def $rdx
18-
; X64-NEXT: ## kill: def $esi killed $esi def $rsi
1917
; X64-NEXT: imull %ecx, %esi
20-
; X64-NEXT: leal (%rsi,%rdx), %eax
21-
; X64-NEXT: cltq
18+
; X64-NEXT: addl %edx, %esi
19+
; X64-NEXT: movslq %esi, %rax
2220
; X64-NEXT: movl (%rdi,%rax), %eax
23-
; X64-NEXT: leal 4(%rsi,%rdx), %ecx
24-
; X64-NEXT: movslq %ecx, %rcx
25-
; X64-NEXT: movzwl (%rdi,%rcx), %ecx
26-
; X64-NEXT: shlq $32, %rcx
27-
; X64-NEXT: orq %rax, %rcx
28-
; X64-NEXT: movq %rcx, %xmm0
21+
; X64-NEXT: movq %rax, %xmm0
2922
; X64-NEXT: movd %xmm0, %eax
3023
; X64-NEXT: retq
3124
entry:

llvm/test/CodeGen/X86/movmsk-cmp.ll

Lines changed: 26 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -929,22 +929,6 @@ define i1 @allzeros_v16i32_sign(<16 x i32> %arg) {
929929
define i1 @allones_v4i64_sign(<4 x i64> %arg) {
930930
; SSE2-LABEL: allones_v4i64_sign:
931931
; SSE2: # %bb.0:
932-
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
933-
; SSE2-NEXT: pxor %xmm2, %xmm1
934-
; SSE2-NEXT: movdqa %xmm2, %xmm3
935-
; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
936-
; SSE2-NEXT: movdqa %xmm2, %xmm4
937-
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
938-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
939-
; SSE2-NEXT: pand %xmm3, %xmm1
940-
; SSE2-NEXT: por %xmm4, %xmm1
941-
; SSE2-NEXT: pxor %xmm2, %xmm0
942-
; SSE2-NEXT: movdqa %xmm2, %xmm3
943-
; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
944-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
945-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
946-
; SSE2-NEXT: pand %xmm3, %xmm0
947-
; SSE2-NEXT: por %xmm2, %xmm0
948932
; SSE2-NEXT: packssdw %xmm1, %xmm0
949933
; SSE2-NEXT: movmskps %xmm0, %eax
950934
; SSE2-NEXT: cmpb $15, %al
@@ -989,22 +973,6 @@ define i1 @allones_v4i64_sign(<4 x i64> %arg) {
989973
define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
990974
; SSE2-LABEL: allzeros_v4i64_sign:
991975
; SSE2: # %bb.0:
992-
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
993-
; SSE2-NEXT: pxor %xmm2, %xmm1
994-
; SSE2-NEXT: movdqa %xmm2, %xmm3
995-
; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
996-
; SSE2-NEXT: movdqa %xmm2, %xmm4
997-
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
998-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
999-
; SSE2-NEXT: pand %xmm3, %xmm1
1000-
; SSE2-NEXT: por %xmm4, %xmm1
1001-
; SSE2-NEXT: pxor %xmm2, %xmm0
1002-
; SSE2-NEXT: movdqa %xmm2, %xmm3
1003-
; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
1004-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
1005-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
1006-
; SSE2-NEXT: pand %xmm3, %xmm0
1007-
; SSE2-NEXT: por %xmm2, %xmm0
1008976
; SSE2-NEXT: packssdw %xmm1, %xmm0
1009977
; SSE2-NEXT: movmskps %xmm0, %eax
1010978
; SSE2-NEXT: testb %al, %al
@@ -1095,15 +1063,13 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) {
10951063
;
10961064
; AVX1-LABEL: allones_v8i64_sign:
10971065
; AVX1: # %bb.0:
1098-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1099-
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1100-
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
1101-
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
1102-
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
11031066
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1067+
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
11041068
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
11051069
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
11061070
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
1071+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1072+
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
11071073
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
11081074
; AVX1-NEXT: vmovmskps %ymm0, %eax
11091075
; AVX1-NEXT: cmpb $-1, %al
@@ -1198,15 +1164,13 @@ define i1 @allzeros_v8i64_sign(<8 x i64> %arg) {
11981164
;
11991165
; AVX1-LABEL: allzeros_v8i64_sign:
12001166
; AVX1: # %bb.0:
1201-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1202-
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1203-
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
1204-
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
1205-
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
12061167
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1168+
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
12071169
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
12081170
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
12091171
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
1172+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1173+
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
12101174
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
12111175
; AVX1-NEXT: vmovmskps %ymm0, %eax
12121176
; AVX1-NEXT: testb %al, %al
@@ -2539,19 +2503,17 @@ define i1 @allones_v8i64_and1(<8 x i64> %arg) {
25392503
;
25402504
; AVX1-LABEL: allones_v8i64_and1:
25412505
; AVX1: # %bb.0:
2542-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2543-
; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
2544-
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
2545-
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
2546-
; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
2547-
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
2548-
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
25492506
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
25502507
; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
2508+
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
25512509
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
25522510
; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
25532511
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
25542512
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
2513+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2514+
; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
2515+
; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
2516+
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
25552517
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
25562518
; AVX1-NEXT: vmovmskps %ymm0, %eax
25572519
; AVX1-NEXT: cmpb $-1, %al
@@ -2615,19 +2577,17 @@ define i1 @allzeros_v8i64_and1(<8 x i64> %arg) {
26152577
;
26162578
; AVX1-LABEL: allzeros_v8i64_and1:
26172579
; AVX1: # %bb.0:
2618-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2619-
; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
2620-
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
2621-
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
2622-
; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
2623-
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
2624-
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
26252580
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
26262581
; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
2582+
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
26272583
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
26282584
; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
26292585
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
26302586
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
2587+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2588+
; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
2589+
; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
2590+
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
26312591
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
26322592
; AVX1-NEXT: vmovmskps %ymm0, %eax
26332593
; AVX1-NEXT: testb %al, %al
@@ -3962,19 +3922,17 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) {
39623922
;
39633923
; AVX1-LABEL: allones_v8i64_and4:
39643924
; AVX1: # %bb.0:
3965-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3966-
; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
3967-
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
3968-
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
3969-
; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
3970-
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
3971-
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
39723925
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
39733926
; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
3927+
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
39743928
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
39753929
; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0
39763930
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
39773931
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
3932+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3933+
; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
3934+
; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
3935+
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
39783936
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
39793937
; AVX1-NEXT: vmovmskps %ymm0, %eax
39803938
; AVX1-NEXT: cmpb $-1, %al
@@ -4038,19 +3996,17 @@ define i1 @allzeros_v8i64_and4(<8 x i64> %arg) {
40383996
;
40393997
; AVX1-LABEL: allzeros_v8i64_and4:
40403998
; AVX1: # %bb.0:
4041-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4042-
; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
4043-
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
4044-
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
4045-
; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
4046-
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
4047-
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
40483999
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
40494000
; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
4001+
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
40504002
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
40514003
; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0
40524004
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
40534005
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
4006+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4007+
; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
4008+
; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
4009+
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
40544010
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
40554011
; AVX1-NEXT: vmovmskps %ymm0, %eax
40564012
; AVX1-NEXT: testb %al, %al
@@ -4170,22 +4126,6 @@ define i32 @movmskps(<4 x float> %x) {
41704126
define i32 @movmskpd256(<4 x double> %x) {
41714127
; SSE2-LABEL: movmskpd256:
41724128
; SSE2: # %bb.0:
4173-
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
4174-
; SSE2-NEXT: pxor %xmm2, %xmm1
4175-
; SSE2-NEXT: movdqa %xmm2, %xmm3
4176-
; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
4177-
; SSE2-NEXT: movdqa %xmm2, %xmm4
4178-
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
4179-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
4180-
; SSE2-NEXT: pand %xmm3, %xmm1
4181-
; SSE2-NEXT: por %xmm4, %xmm1
4182-
; SSE2-NEXT: pxor %xmm2, %xmm0
4183-
; SSE2-NEXT: movdqa %xmm2, %xmm3
4184-
; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
4185-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
4186-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
4187-
; SSE2-NEXT: pand %xmm3, %xmm0
4188-
; SSE2-NEXT: por %xmm2, %xmm0
41894129
; SSE2-NEXT: packssdw %xmm1, %xmm0
41904130
; SSE2-NEXT: movmskps %xmm0, %eax
41914131
; SSE2-NEXT: retq

0 commit comments

Comments
 (0)