Skip to content

Commit 8bc2d19

Browse files
committed
[X86] canonicalizeShuffleWithOp - don't fold VPERMI(BINOP(X,Y)) -> BINOP(VPERMI(X),VPERMI(Y))
VPERMI (VPERMQ/VPERMPD) is nearly always lane-crossing and merges poorly with target shuffles (other than itself). For now, I've restricted VPERMI to only merge with itself, constants, loads and splats. We might be able to merge with a few other special cases (AND/ANDNP with constant?), which could help the shuffle-vs-trunc-256.ll AVX512VL regression, but since that now gives similar codegen to the other AVX512 variants, I'd prefer to improve the shuffle lowering for that properly.
1 parent 82be6e1 commit 8bc2d19

File tree

4 files changed

+4473
-4860
lines changed

4 files changed

+4473
-4860
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -39961,8 +39961,10 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
3996139961
const SDLoc &DL) {
3996239962
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3996339963
EVT ShuffleVT = N.getValueType();
39964+
unsigned Opc = N.getOpcode();
3996439965

39965-
auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
39966+
auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
39967+
bool FoldLoad = false) {
3996639968
// AllZeros/AllOnes constants are freely shuffled and will peek through
3996739969
// bitcasts. Other constant build vectors do not peek through bitcasts. Only
3996839970
// merge with target shuffles if it has one use so shuffle combining is
@@ -39972,8 +39974,9 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
3997239974
ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
3997339975
ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
3997439976
getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
39977+
(Op.getOpcode() == Opc && Op->hasOneUse()) ||
3997539978
(Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
39976-
(isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
39979+
(FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
3997739980
(FoldLoad && isShuffleFoldableLoad(Op)) ||
3997839981
DAG.isSplatValue(Op, /*AllowUndefs*/ false);
3997939982
};
@@ -39984,7 +39987,6 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
3998439987
(Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
3998539988
};
3998639989

39987-
unsigned Opc = N.getOpcode();
3998839990
switch (Opc) {
3998939991
// Unary and Unary+Permute Shuffles.
3999039992
case X86ISD::PSHUFB: {
@@ -40010,8 +40012,10 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
4001040012
if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
4001140013
SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
4001240014
SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40013-
if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
40014-
IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
40015+
if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
40016+
Opc != X86ISD::PSHUFB) ||
40017+
IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
40018+
Opc != X86ISD::PSHUFB)) {
4001540019
SDValue LHS, RHS;
4001640020
Op00 = DAG.getBitcast(ShuffleVT, Op00);
4001740021
Op01 = DAG.getBitcast(ShuffleVT, Op01);

llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1333,8 +1333,10 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
13331333
; AVX512VL-LABEL: negative:
13341334
; AVX512VL: # %bb.0:
13351335
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
1336-
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1337-
; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
1336+
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1337+
; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
1338+
; AVX512VL-NEXT: vpternlogq $206, %ymm1, %ymm0, %ymm2
1339+
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,3,2,3]
13381340
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
13391341
; AVX512VL-NEXT: vzeroupper
13401342
; AVX512VL-NEXT: retq

0 commit comments

Comments
 (0)