@@ -39602,7 +39602,7 @@ static bool matchBinaryPermuteShuffle(
39602
39602
static SDValue combineX86ShuffleChainWithExtract(
39603
39603
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39604
39604
ArrayRef<const SDNode *> SrcNodes, bool AllowVariableCrossLaneMask,
39605
- bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39605
+ bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
39606
39606
const X86Subtarget &Subtarget);
39607
39607
39608
39608
/// Combine an arbitrary chain of shuffles into a single instruction if
@@ -39619,6 +39619,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39619
39619
ArrayRef<const SDNode *> SrcNodes,
39620
39620
bool AllowVariableCrossLaneMask,
39621
39621
bool AllowVariablePerLaneMask,
39622
+ bool IsMaskedShuffle,
39622
39623
SelectionDAG &DAG,
39623
39624
const X86Subtarget &Subtarget) {
39624
39625
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
@@ -39666,17 +39667,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39666
39667
(RootVT.isFloatingPoint() && Depth >= 1) ||
39667
39668
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
39668
39669
39669
- // Don't combine if we are a AVX512/EVEX target and the mask element size
39670
- // is different from the root element size - this would prevent writemasks
39671
- // from being reused.
39672
- bool IsMaskedShuffle = false;
39673
- if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39674
- if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
39675
- Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39676
- IsMaskedShuffle = true;
39677
- }
39678
- }
39679
-
39680
39670
// If we are shuffling a splat (and not introducing zeros) then we can just
39681
39671
// use it directly. This works for smaller elements as well as they already
39682
39672
// repeat across each mask element.
@@ -40167,7 +40157,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
40167
40157
// shuffle with the larger type.
40168
40158
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40169
40159
Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask,
40170
- AllowVariablePerLaneMask, DAG, Subtarget))
40160
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget))
40171
40161
return WideShuffle;
40172
40162
40173
40163
// If we have a dual input lane-crossing shuffle then lower to VPERMV3,
@@ -40339,7 +40329,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
40339
40329
// shuffle with the larger type.
40340
40330
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40341
40331
Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask,
40342
- AllowVariablePerLaneMask, DAG, Subtarget))
40332
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget))
40343
40333
return WideShuffle;
40344
40334
40345
40335
// If we have a dual input shuffle then lower to VPERMV3,
@@ -40378,7 +40368,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
40378
40368
static SDValue combineX86ShuffleChainWithExtract(
40379
40369
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40380
40370
ArrayRef<const SDNode *> SrcNodes, bool AllowVariableCrossLaneMask,
40381
- bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40371
+ bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
40382
40372
const X86Subtarget &Subtarget) {
40383
40373
unsigned NumMaskElts = BaseMask.size();
40384
40374
unsigned NumInputs = Inputs.size();
@@ -40504,10 +40494,10 @@ static SDValue combineX86ShuffleChainWithExtract(
40504
40494
assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40505
40495
"WideRootSize mismatch");
40506
40496
40507
- if (SDValue WideShuffle =
40508
- combineX86ShuffleChain( WideInputs, WideRoot, WideMask, Depth,
40509
- SrcNodes, AllowVariableCrossLaneMask ,
40510
- AllowVariablePerLaneMask, DAG, Subtarget)) {
40497
+ if (SDValue WideShuffle = combineX86ShuffleChain(
40498
+ WideInputs, WideRoot, WideMask, Depth, SrcNodes ,
40499
+ AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle ,
40500
+ DAG, Subtarget)) {
40511
40501
WideShuffle =
40512
40502
extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40513
40503
return DAG.getBitcast(RootVT, WideShuffle);
@@ -41244,6 +41234,16 @@ static SDValue combineX86ShufflesRecursively(
41244
41234
resolveTargetShuffleInputsAndMask(Ops, Mask);
41245
41235
}
41246
41236
41237
+ // If we are an AVX512/EVEX target the mask element size should match the root
41238
+ // element size to allow writemasks to be reused.
41239
+ bool IsMaskedShuffle = false;
41240
+ if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
41241
+ if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
41242
+ Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
41243
+ IsMaskedShuffle = true;
41244
+ }
41245
+ }
41246
+
41247
41247
// We can only combine unary and binary shuffle mask cases.
41248
41248
if (Ops.size() <= 2) {
41249
41249
// Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -41268,7 +41268,7 @@ static SDValue combineX86ShufflesRecursively(
41268
41268
// Try to combine into a single shuffle instruction.
41269
41269
if (SDValue Shuffle = combineX86ShuffleChain(
41270
41270
Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask,
41271
- AllowVariablePerLaneMask, DAG, Subtarget))
41271
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget))
41272
41272
return Shuffle;
41273
41273
41274
41274
// If all the operands come from the same larger vector, fallthrough and try
@@ -41287,7 +41287,7 @@ static SDValue combineX86ShufflesRecursively(
41287
41287
// shuffle with the larger type.
41288
41288
return combineX86ShuffleChainWithExtract(
41289
41289
Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask,
41290
- AllowVariablePerLaneMask, DAG, Subtarget);
41290
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget);
41291
41291
}
41292
41292
41293
41293
/// Helper entry wrapper to combineX86ShufflesRecursively.
0 commit comments