Skip to content

Commit 1a8563d

Browse files
RKSimon and github-actions[bot]
authored and committed
Automerge: [X86] fold AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0) to support AVX512 predicated {k}{z} masks (#131788)
We already do this for the ANDNP(SEXT(SETCC()),X) equivalent pattern. Fixes #109272
2 parents 784b961 + b3d280b commit 1a8563d

File tree

4 files changed

+48
-48
lines changed

4 files changed

+48
-48
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51348,6 +51348,8 @@ static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
5134851348
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
5134951349
TargetLowering::DAGCombinerInfo &DCI,
5135051350
const X86Subtarget &Subtarget) {
51351+
using namespace SDPatternMatch;
51352+
5135151353
SDValue N0 = N->getOperand(0);
5135251354
SDValue N1 = N->getOperand(1);
5135351355
EVT VT = N->getValueType(0);
@@ -51482,6 +51484,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
5148251484
}
5148351485
}
5148451486

51487+
// On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
51488+
// to make use of predicated selects.
51489+
// AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51490+
if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51491+
SDValue X, Y;
51492+
EVT CondVT = VT.changeVectorElementType(MVT::i1);
51493+
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51494+
sd_match(N, m_And(m_Value(X),
51495+
m_OneUse(m_SExt(m_AllOf(
51496+
m_Value(Y), m_SpecificVT(CondVT),
51497+
m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51498+
return DAG.getSelect(dl, VT, Y, X,
51499+
getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51500+
}
51501+
}
51502+
5148551503
// Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
5148651504
// avoids slow variable shift (moving shift amount to ECX etc.)
5148751505
if (isOneConstant(N1) && N0->hasOneUse()) {

llvm/test/CodeGen/X86/gfni-lzcnt.ll

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -360,14 +360,12 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
360360
; GFNIAVX512BW-LABEL: testv64i8:
361361
; GFNIAVX512BW: # %bb.0:
362362
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
363+
; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
363364
; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
364365
; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
365-
; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
366-
; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
367-
; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
368-
; GFNIAVX512BW-NEXT: vpmovm2b %k0, %zmm1
369-
; GFNIAVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
370-
; GFNIAVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
366+
; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
367+
; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
368+
; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
371369
; GFNIAVX512BW-NEXT: retq
372370
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0)
373371
ret <64 x i8> %out
@@ -494,14 +492,12 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
494492
; GFNIAVX512BW-LABEL: testv64i8u:
495493
; GFNIAVX512BW: # %bb.0:
496494
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
495+
; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
497496
; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
498497
; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
499-
; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
500-
; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
501-
; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
502-
; GFNIAVX512BW-NEXT: vpmovm2b %k0, %zmm1
503-
; GFNIAVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
504-
; GFNIAVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
498+
; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
499+
; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
500+
; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
505501
; GFNIAVX512BW-NEXT: retq
506502
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1)
507503
ret <64 x i8> %out

llvm/test/CodeGen/X86/vector-lzcnt-512.ll

Lines changed: 16 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -369,14 +369,12 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
369369
; AVX512BW: # %bb.0:
370370
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
371371
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
372+
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
372373
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
373374
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
374-
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
375-
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
376-
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
377-
; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
378-
; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
379-
; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
375+
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm3 {%k1} {z}
376+
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
377+
; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1
380378
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
381379
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
382380
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -455,14 +453,12 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
455453
; AVX512BW: # %bb.0:
456454
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
457455
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
456+
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
458457
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
459458
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
460-
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
461-
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
462-
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
463-
; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
464-
; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
465-
; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
459+
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm3 {%k1} {z}
460+
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
461+
; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1
466462
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
467463
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
468464
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -561,14 +557,12 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
561557
; AVX512BW: # %bb.0:
562558
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
563559
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
560+
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
564561
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
565562
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
566-
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
567-
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
568-
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
569-
; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
570-
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
571-
; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
563+
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
564+
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
565+
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
572566
; AVX512BW-NEXT: retq
573567
;
574568
; AVX512DQ-LABEL: testv64i8:
@@ -651,14 +645,12 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
651645
; AVX512BW: # %bb.0:
652646
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
653647
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
648+
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
654649
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
655650
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
656-
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
657-
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
658-
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
659-
; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
660-
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
661-
; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
651+
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
652+
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
653+
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
662654
; AVX512BW-NEXT: retq
663655
;
664656
; AVX512DQ-LABEL: testv64i8u:

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll

Lines changed: 6 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -149,10 +149,8 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64
149149
define <64 x i8> @combine_vpermi2var_v64i8_with_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2) {
150150
; CHECK-LABEL: combine_vpermi2var_v64i8_with_mask:
151151
; CHECK: # %bb.0:
152-
; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0
153-
; CHECK-NEXT: vpmovb2m %zmm1, %k0
154-
; CHECK-NEXT: vpmovm2b %k0, %zmm1
155-
; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
152+
; CHECK-NEXT: vpmovb2m %zmm1, %k1
153+
; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0 {%k1} {z}
156154
; CHECK-NEXT: ret{{[l|q]}}
157155
%perm = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2)
158156
%cmp = icmp slt <64 x i8> %a1, zeroinitializer
@@ -177,19 +175,15 @@ define <64 x i8> @combine_vpermi2var_constant_v64i8_with_mask(<64 x i8> %a0) {
177175
; X86-LABEL: combine_vpermi2var_constant_v64i8_with_mask:
178176
; X86: # %bb.0:
179177
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
180-
; X86-NEXT: vpermt2b {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm1
181-
; X86-NEXT: vpmovb2m %zmm0, %k0
182-
; X86-NEXT: vpmovm2b %k0, %zmm0
183-
; X86-NEXT: vpandq %zmm1, %zmm0, %zmm0
178+
; X86-NEXT: vpmovb2m %zmm0, %k1
179+
; X86-NEXT: vpermi2b {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm0 {%k1} {z}
184180
; X86-NEXT: retl
185181
;
186182
; X64-LABEL: combine_vpermi2var_constant_v64i8_with_mask:
187183
; X64: # %bb.0:
188184
; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
189-
; X64-NEXT: vpermt2b {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
190-
; X64-NEXT: vpmovb2m %zmm0, %k0
191-
; X64-NEXT: vpmovm2b %k0, %zmm0
192-
; X64-NEXT: vpandq %zmm1, %zmm0, %zmm0
185+
; X64-NEXT: vpmovb2m %zmm0, %k1
186+
; X64-NEXT: vpermi2b {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 {%k1} {z}
193187
; X64-NEXT: retq
194188
%perm = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63>, <64 x i8> %a0, <64 x i8> <i8 64, i8 65, i8 66, i8 67, i8 68, i8 69, i8 70, i8 71, i8 72, i8 73, i8 74, i8 75, i8 76, i8 77, i8 78, i8 79, i8 80, i8 81, i8 82, i8 83, i8 84, i8 85, i8 86, i8 87, i8 88, i8 89, i8 90, i8 91, i8 92, i8 93, i8 94, i8 95, i8 96, i8 97, i8 98, i8 99, i8 100, i8 101, i8 102, i8 103, i8 104, i8 105, i8 106, i8 107, i8 108, i8 109, i8 110, i8 111, i8 112, i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127>)
195189
%cmp = icmp slt <64 x i8> %a0, zeroinitializer

0 commit comments

Comments
 (0)