-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[VectorCombine] Add free concats to shuffleToIdentity. #94954
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms Author: David Green (davemgreen) ChangesThis is another relatively small adjustment to shuffleToIdentity, which has had a few knock-one effects to need a few more changes. It attempts to detect free concats, that will be legalized to multiple vector operations. For example if the lanes are '[a[0], a[1], b[0], b[1]]' and a and b are v2f64 under aarch64. In order to do this:
Patch is 43.44 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94954.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e608c7fb60468..0d36e29db7b43 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1669,20 +1669,20 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
return true;
}
-using InstLane = std::pair<Value *, int>;
+using InstLane = std::pair<Use *, int>;
-static InstLane lookThroughShuffles(Value *V, int Lane) {
- while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+static InstLane lookThroughShuffles(Use *V, int Lane) {
+ while (auto *SV = dyn_cast<ShuffleVectorInst>(V->get())) {
unsigned NumElts =
cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
int M = SV->getMaskValue(Lane);
if (M < 0)
return {nullptr, PoisonMaskElem};
if (static_cast<unsigned>(M) < NumElts) {
- V = SV->getOperand(0);
+ V = &SV->getOperandUse(0);
Lane = M;
} else {
- V = SV->getOperand(1);
+ V = &SV->getOperandUse(1);
Lane = M - NumElts;
}
}
@@ -1695,37 +1695,83 @@ generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
for (InstLane IL : Item) {
auto [V, Lane] = IL;
InstLane OpLane =
- V ? lookThroughShuffles(cast<Instruction>(V)->getOperand(Op), Lane)
+ V ? lookThroughShuffles(&cast<Instruction>(V->get())->getOperandUse(Op),
+ Lane)
: InstLane{nullptr, PoisonMaskElem};
NItem.emplace_back(OpLane);
}
return NItem;
}
+/// Detect concat of multiple values into a vector
+static bool isFreeConcat(ArrayRef<InstLane> Item,
+ const TargetTransformInfo &TTI) {
+ auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
+ unsigned NumElts = Ty->getNumElements();
+ if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
+ return false;
+
+ // Check that the concat is free, usually meaning that the type will be split
+ // during legalization.
+ SmallVector<int, 16> ConcatMask(Ty->getNumElements() * 2);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask,
+ TTI::TCK_RecipThroughput) != 0)
+ return false;
+
+ unsigned NumSlices = Item.size() / NumElts;
+ // Currently we generate a tree of shuffles for the concats, which limits us
+ // to a power2.
+ if (!isPowerOf2_32(NumSlices))
+ return false;
+ for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
+ Use *SliceV = Item[Slice * NumElts].first;
+ if (!SliceV || SliceV->get()->getType() != Ty)
+ return false;
+ for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
+ auto [V, Lane] = Item[Slice * NumElts + Elt];
+ if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
+ return false;
+ }
+ }
+ return true;
+}
+
static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
- const SmallPtrSet<Value *, 4> &IdentityLeafs,
- const SmallPtrSet<Value *, 4> &SplatLeafs,
+ const SmallPtrSet<Use *, 4> &IdentityLeafs,
+ const SmallPtrSet<Use *, 4> &SplatLeafs,
+ const SmallPtrSet<Use *, 4> &ConcatLeafs,
IRBuilder<> &Builder) {
auto [FrontV, FrontLane] = Item.front();
- if (IdentityLeafs.contains(FrontV) &&
- all_of(drop_begin(enumerate(Item)), [Item](const auto &E) {
- Value *FrontV = Item.front().first;
- auto [V, Lane] = E.value();
- return !V || (V == FrontV && Lane == (int)E.index());
- })) {
- return FrontV;
+ if (IdentityLeafs.contains(FrontV)) {
+ return FrontV->get();
}
if (SplatLeafs.contains(FrontV)) {
- if (auto *ILI = dyn_cast<Instruction>(FrontV))
- Builder.SetInsertPoint(*ILI->getInsertionPointAfterDef());
- else if (auto *Arg = dyn_cast<Argument>(FrontV))
- Builder.SetInsertPointPastAllocas(Arg->getParent());
SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
- return Builder.CreateShuffleVector(FrontV, Mask);
+ return Builder.CreateShuffleVector(FrontV->get(), Mask);
+ }
+ if (ConcatLeafs.contains(FrontV)) {
+ unsigned NumElts =
+ cast<FixedVectorType>(FrontV->get()->getType())->getNumElements();
+ SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
+ for (unsigned S = 0; S < Values.size(); ++S)
+ Values[S] = Item[S * NumElts].first->get();
+
+ while (Values.size() > 1) {
+ NumElts *= 2;
+ SmallVector<int, 16> Mask(NumElts, 0);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
+ for (unsigned S = 0; S < NewValues.size(); ++S)
+ NewValues[S] =
+ Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
+ Values = NewValues;
+ }
+ return Values[0];
}
- auto *I = cast<Instruction>(FrontV);
+ auto *I = cast<Instruction>(FrontV->get());
auto *II = dyn_cast<IntrinsicInst>(I);
unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
SmallVector<Value *> Ops(NumOps);
@@ -1734,16 +1780,16 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
Ops[Idx] = II->getOperand(Idx);
continue;
}
- Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
- Ty, IdentityLeafs, SplatLeafs, Builder);
+ Ops[Idx] =
+ generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), Ty,
+ IdentityLeafs, SplatLeafs, ConcatLeafs, Builder);
}
SmallVector<Value *, 8> ValueList;
for (const auto &Lane : Item)
if (Lane.first)
- ValueList.push_back(Lane.first);
+ ValueList.push_back(Lane.first->get());
- Builder.SetInsertPoint(I);
Type *DstTy =
FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
if (auto *BI = dyn_cast<BinaryOperator>(I)) {
@@ -1785,16 +1831,16 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
// do so.
bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
auto *Ty = dyn_cast<FixedVectorType>(I.getType());
- if (!Ty)
+ if (!Ty || I.use_empty())
return false;
SmallVector<InstLane> Start(Ty->getNumElements());
for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
- Start[M] = lookThroughShuffles(&I, M);
+ Start[M] = lookThroughShuffles(&*I.use_begin(), M);
SmallVector<SmallVector<InstLane>> Worklist;
Worklist.push_back(Start);
- SmallPtrSet<Value *, 4> IdentityLeafs, SplatLeafs;
+ SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
unsigned NumVisited = 0;
while (!Worklist.empty()) {
@@ -1809,12 +1855,12 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
return false;
// Look for an identity value.
- if (!FrontLane &&
- cast<FixedVectorType>(FrontV->getType())->getNumElements() ==
+ if (FrontLane == 0 &&
+ cast<FixedVectorType>(FrontV->get()->getType())->getNumElements() ==
Ty->getNumElements() &&
all_of(drop_begin(enumerate(Item)), [Item](const auto &E) {
- Value *FrontV = Item.front().first;
- return !E.value().first || (E.value().first == FrontV &&
+ Value *FrontV = Item.front().first->get();
+ return !E.value().first || (E.value().first->get() == FrontV &&
E.value().second == (int)E.index());
})) {
IdentityLeafs.insert(FrontV);
@@ -1824,9 +1870,9 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (auto *C = dyn_cast<Constant>(FrontV);
C && C->getSplatValue() &&
all_of(drop_begin(Item), [Item](InstLane &IL) {
- Value *FrontV = Item.front().first;
- Value *V = IL.first;
- return !V || V == FrontV;
+ Value *FrontV = Item.front().first->get();
+ Use *V = IL.first;
+ return !V || V->get() == FrontV;
})) {
SplatLeafs.insert(FrontV);
continue;
@@ -1835,7 +1881,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (all_of(drop_begin(Item), [Item](InstLane &IL) {
auto [FrontV, FrontLane] = Item.front();
auto [V, Lane] = IL;
- return !V || (V == FrontV && Lane == FrontLane);
+ return !V || (V->get() == FrontV->get() && Lane == FrontLane);
})) {
SplatLeafs.insert(FrontV);
continue;
@@ -1843,11 +1889,11 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
// We need each element to be the same type of value, and check that each
// element has a single use.
- if (!all_of(drop_begin(Item), [Item](InstLane IL) {
- Value *FrontV = Item.front().first;
- Value *V = IL.first;
- if (!V)
+ if (all_of(drop_begin(Item), [Item](InstLane IL) {
+ Value *FrontV = Item.front().first->get();
+ if (!IL.first)
return true;
+ Value *V = IL.first->get();
if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUse())
return false;
if (V->getValueID() != FrontV->getValueID())
@@ -1864,40 +1910,49 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
return !II || (isa<IntrinsicInst>(FrontV) &&
II->getIntrinsicID() ==
cast<IntrinsicInst>(FrontV)->getIntrinsicID());
- }))
- return false;
-
- // Check the operator is one that we support. We exclude div/rem in case
- // they hit UB from poison lanes.
- if ((isa<BinaryOperator>(FrontV) &&
- !cast<BinaryOperator>(FrontV)->isIntDivRem()) ||
- isa<CmpInst>(FrontV)) {
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
- } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontV)) {
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
- } else if (isa<SelectInst>(FrontV)) {
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
- } else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
- II && isTriviallyVectorizable(II->getIntrinsicID())) {
- for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
- if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) {
- if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
- Value *FrontV = Item.front().first;
- Value *V = IL.first;
- return !V || (cast<Instruction>(V)->getOperand(Op) ==
- cast<Instruction>(FrontV)->getOperand(Op));
- }))
- return false;
- continue;
+ })) {
+ // Check the operator is one that we support.
+ if (isa<BinaryOperator, CmpInst>(FrontV)) {
+ // We exclude div/rem in case they hit UB from poison lanes.
+ if (auto *BO = dyn_cast<BinaryOperator>(FrontV);
+ BO && BO->isIntDivRem())
+ return false;
+ Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
+ Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
+ continue;
+ } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontV)) {
+ Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
+ continue;
+ } else if (isa<SelectInst>(FrontV)) {
+ Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
+ Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
+ Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
+ continue;
+ } else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
+ II && isTriviallyVectorizable(II->getIntrinsicID())) {
+ for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
+ if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) {
+ if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
+ Value *FrontV = Item.front().first->get();
+ Value *V = IL.first->get();
+ return !V || (cast<Instruction>(V)->getOperand(Op) ==
+ cast<Instruction>(FrontV)->getOperand(Op));
+ }))
+ return false;
+ continue;
+ }
+ Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
}
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
+ continue;
}
- } else {
- return false;
}
+
+ if (isFreeConcat(Item, TTI)) {
+ ConcatLeafs.insert(FrontV);
+ continue;
+ }
+
+ return false;
}
if (NumVisited <= 1)
@@ -1905,7 +1960,9 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
// If we got this far, we know the shuffles are superfluous and can be
// removed. Scan through again and generate the new tree of instructions.
- Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs, Builder);
+ Builder.SetInsertPoint(&I);
+ Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
+ ConcatLeafs, Builder);
replaceValue(I, *V);
return true;
}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
index c085e10c049a9..3ee8ba5d09ed1 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
@@ -22,9 +22,9 @@ define void @add4(ptr noalias noundef %x, ptr noalias noundef %y, i32 noundef %n
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP2]]
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -403,12 +403,12 @@ define void @addmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z, i32
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Z:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC31:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[TMP4]], [[WIDE_VEC36]]
; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity-concat.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity-concat.ll
index d72532963d12a..7aba1bbb1c9a0 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity-concat.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity-concat.ll
@@ -82,15 +82,9 @@ define <8 x i8> @concata_addmul_small(<4 x i8> %a1, <4 x i8> %a2, <8 x i8> %b, <
define <8 x i32> @concata_addmul_big(<4 x i32> %a1, <4 x i32> %a2, <8 x i32> %b, <8 x i32> %c) {
; CHECK-LABEL: @concata_addmul_big(
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[CB:%.*]] = shufflevector <8 x i32> [[C:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[CT:%.*]] = shufflevector <8 x i32> [[C]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[XB:%.*]] = mul <4 x i32> [[A1:%.*]], [[BB]]
-; CHECK-NEXT: [[XT:%.*]] = mul <4 x i32> [[A2:%.*]], [[BT]]
-; CHECK-NEXT: [[YB:%.*]] = add <4 x i32> [[XB]], [[CB]]
-; CHECK-NEXT: [[YT:%.*]] = add <4 x i32> [[XT]], [[CT]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[YB]], <4 x i32> [[YT]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: [[R:%.*]] = add <8 x i32> [[TMP2]], [[C:%.*]]
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%bb = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -107,29 +101,11 @@ define <8 x i32> @concata_addmul_big(<4 x i32> %a1, <4 x i32> %a2, <8 x i32> %b,
define <16 x i32> @concata_addmul_bigger(<4 x i32> %a1a, <4 x i32> %a2a, <4 x i32> %a3a, <4 x i32> %a4a, <16 x i32> %b, <16 x i32> %c) {
; CHECK-LABEL: @concata_addmul_bigger(
-; CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i32> [[A1A:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[A2:%.*]] = shufflevector <4 x i32> [[A2A:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[A3:%.*]] = shufflevector <4 x i32> [[A3A:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[A4:%.*]] = shufflevector <4 x i32> [[A4A:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[B1:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <4 x i32> <i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEXT: [[B2:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <4 x i32> <i32 11, i32 10, i32 9, i32 8>
-; CHECK-NEXT: [[B3:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[B4:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[C1:%.*]] = shufflevector <16 x i32> [[C:%.*]], <16 x i32> poison, <4 x i32> <i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEXT: [[C2:%.*]] = shufflevector <16 x i32> [[C]], <16 x i32> poison, <4 x i32> <i32 11, i32 10, i32 9, i32 8>
-; CHECK-NEXT: [[C3:%.*]] = shufflevector <16 x i32> [[C]], <16 x i32> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[C4:%.*]] = shufflevector <16 x i32> [[C]], <16 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[X1:%.*]] = mul <4 x i32> [[A1]], [[B1]]
-; CHECK-NEXT: [[X2:%.*]] = mul <4 x i32> [[A2]], [[B2]]
-; CHECK-NEXT: [[X3:%.*]] = mul <4 x i32> [[A3]], [[B3]]
-; CHECK-NEXT: [[X4:%.*]] = mul <4 x i32> [[A4]], [[B4]]
-; CHECK-NEXT: [[Y1:%.*]] = add <4 x i32> [[X1]], [[C1]]
-; CHECK-NEXT: [[Y2:%.*]] =...
[truncated]
|
When looking up through shuffles, a Value can be multiple different leaf types (for example an identity from one position, a splat from another). We currently detect this by recalculating which type of leaf it is when generating, but as more types of leafs are added (#94954) this doesn't scale very well. This patch switches it to use Use, not Value, to more accurately detect which type of leaf each Use should have.
0dc6388
to
003f56b
Compare
Ping. I know there is talk of making other vectorizers perform re-vectorization, but I would like to get this in if we can. It helps fix a regression, Thanks. |
Ping. Thanks |
Sorry I thought this was related to the costmodel PR - I will take a look soon |
|
||
// Check that the concat is free, usually meaning that the type will be split | ||
// during legalization. | ||
SmallVector<int, 16> ConcatMask(Ty->getNumElements() * 2); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NumElts * 2
- but why hardcode to 2?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The shuffle costs are limited to 2 inputs, so it is measuring the cost of the first step in the tree it needs to create, and assuming that if that is free the whole thing will be free. Does that sound OK or do you think it should be measuring different levels?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that should be OK.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for taking a look. The cost-model issue I have not heard back from, so I believe / am hoping it was fixed by one of the backend patches.
|
||
// Check that the concat is free, usually meaning that the type will be split | ||
// during legalization. | ||
SmallVector<int, 16> ConcatMask(Ty->getNumElements() * 2); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The shuffle costs are limited to 2 inputs, so it is measuring the cost of the first step in the tree it needs to create, and assuming that if that is free the whole thing will be free. Does that sound OK or do you think it should be measuring different levels?
This is another relatively small adjustment to shuffleToIdentity, which has had a few knock-one effects to need a few more changes. It attempts to detect free concats, that will be legalized to multiple vector operations. For example if the lanes are '[a[0], a[1], b[0], b[1]]' and a and b are v2f64 under aarch64. In order to do this: - isFreeConcat detects whether the input has piece-wise identities that can become a concat. - A tree of concat shuffles is created to concatenate the input values into a single vector. This is a little different to most other inputs as there are created from multiple values that are being combines together, and we cannot rely on the Lane0 insert location always being valid. - The insert location is changed to the original location instead of updating per item, which ensure it is valid due to the order that we visit and create items.
003f56b
to
63dd2fa
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
This is another relatively small adjustment to shuffleToIdentity, which has had a few knock-one effects to need a few more changes. It attempts to detect free concats, that will be legalized to multiple vector operations. For example if the lanes are '[a[0], a[1], b[0], b[1]]' and a and b are v2f64 under aarch64. In order to do this: - isFreeConcat detects whether the input has piece-wise identities from multiple inputs that can become a concat. - A tree of concat shuffles is created to concatenate the input values into a single vector. This is a little different to most other inputs as there are created from multiple values that are being combined together, and we cannot rely on the Lane0 insert location always being valid. - The insert location is changed to the original location instead of updating per item, which ensure it is valid due to the order that we visit and create items.
This is another relatively small adjustment to shuffleToIdentity, which has had a few knock-one effects to need a few more changes. It attempts to detect free concats, that will be legalized to multiple vector operations. For example if the lanes are '[a[0], a[1], b[0], b[1]]' and a and b are v2f64 under aarch64.
In order to do this: