Skip to content

Commit 0dc6388

Browse files
committed
[VectorCombine] Add free concats to shuffleToIdentity.
This is another relatively small adjustment to shuffleToIdentity, which has had a few knock-on effects that required a few more changes. It attempts to detect free concats, which will be legalized to multiple vector operations. For example, when the lanes are '[a[0], a[1], b[0], b[1]]' and a and b are v2f64 under AArch64. In order to do this: - isFreeConcat detects whether the input has piece-wise identities that can become a concat. - A tree of concat shuffles is created to concatenate the input values into a single vector. This is a little different to most other inputs as they are created from multiple values that are being combined together, and we cannot rely on the Lane0 insert location always being valid. - The insert location is changed to the original location instead of updating per item, which ensures it is valid due to the order that we visit and create items. - As with splats/identities, an input value could both be a concat and a splat. To make the creation simpler the Items have been changed to be the Use, not Value, to help identify values more reliably without needing to recompute whether they are identities/splats/concats.
1 parent d1b5a4b commit 0dc6388

File tree

4 files changed

+189
-237
lines changed

4 files changed

+189
-237
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 129 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1669,20 +1669,20 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
16691669
return true;
16701670
}
16711671

1672-
using InstLane = std::pair<Value *, int>;
1672+
using InstLane = std::pair<Use *, int>;
16731673

1674-
static InstLane lookThroughShuffles(Value *V, int Lane) {
1675-
while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
1674+
static InstLane lookThroughShuffles(Use *V, int Lane) {
1675+
while (auto *SV = dyn_cast<ShuffleVectorInst>(V->get())) {
16761676
unsigned NumElts =
16771677
cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
16781678
int M = SV->getMaskValue(Lane);
16791679
if (M < 0)
16801680
return {nullptr, PoisonMaskElem};
16811681
if (static_cast<unsigned>(M) < NumElts) {
1682-
V = SV->getOperand(0);
1682+
V = &SV->getOperandUse(0);
16831683
Lane = M;
16841684
} else {
1685-
V = SV->getOperand(1);
1685+
V = &SV->getOperandUse(1);
16861686
Lane = M - NumElts;
16871687
}
16881688
}
@@ -1695,37 +1695,83 @@ generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
16951695
for (InstLane IL : Item) {
16961696
auto [V, Lane] = IL;
16971697
InstLane OpLane =
1698-
V ? lookThroughShuffles(cast<Instruction>(V)->getOperand(Op), Lane)
1698+
V ? lookThroughShuffles(&cast<Instruction>(V->get())->getOperandUse(Op),
1699+
Lane)
16991700
: InstLane{nullptr, PoisonMaskElem};
17001701
NItem.emplace_back(OpLane);
17011702
}
17021703
return NItem;
17031704
}
17041705

1706+
/// Detect concat of multiple values into a vector
1707+
static bool isFreeConcat(ArrayRef<InstLane> Item,
1708+
const TargetTransformInfo &TTI) {
1709+
auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
1710+
unsigned NumElts = Ty->getNumElements();
1711+
if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
1712+
return false;
1713+
1714+
// Check that the concat is free, usually meaning that the type will be split
1715+
// during legalization.
1716+
SmallVector<int, 16> ConcatMask(Ty->getNumElements() * 2);
1717+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
1718+
if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask,
1719+
TTI::TCK_RecipThroughput) != 0)
1720+
return false;
1721+
1722+
unsigned NumSlices = Item.size() / NumElts;
1723+
// Currently we generate a tree of shuffles for the concats, which limits us
1724+
// to a power2.
1725+
if (!isPowerOf2_32(NumSlices))
1726+
return false;
1727+
for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
1728+
Use *SliceV = Item[Slice * NumElts].first;
1729+
if (!SliceV || SliceV->get()->getType() != Ty)
1730+
return false;
1731+
for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
1732+
auto [V, Lane] = Item[Slice * NumElts + Elt];
1733+
if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
1734+
return false;
1735+
}
1736+
}
1737+
return true;
1738+
}
1739+
17051740
static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
1706-
const SmallPtrSet<Value *, 4> &IdentityLeafs,
1707-
const SmallPtrSet<Value *, 4> &SplatLeafs,
1741+
const SmallPtrSet<Use *, 4> &IdentityLeafs,
1742+
const SmallPtrSet<Use *, 4> &SplatLeafs,
1743+
const SmallPtrSet<Use *, 4> &ConcatLeafs,
17081744
IRBuilder<> &Builder) {
17091745
auto [FrontV, FrontLane] = Item.front();
17101746

1711-
if (IdentityLeafs.contains(FrontV) &&
1712-
all_of(drop_begin(enumerate(Item)), [Item](const auto &E) {
1713-
Value *FrontV = Item.front().first;
1714-
auto [V, Lane] = E.value();
1715-
return !V || (V == FrontV && Lane == (int)E.index());
1716-
})) {
1717-
return FrontV;
1747+
if (IdentityLeafs.contains(FrontV)) {
1748+
return FrontV->get();
17181749
}
17191750
if (SplatLeafs.contains(FrontV)) {
1720-
if (auto *ILI = dyn_cast<Instruction>(FrontV))
1721-
Builder.SetInsertPoint(*ILI->getInsertionPointAfterDef());
1722-
else if (auto *Arg = dyn_cast<Argument>(FrontV))
1723-
Builder.SetInsertPointPastAllocas(Arg->getParent());
17241751
SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
1725-
return Builder.CreateShuffleVector(FrontV, Mask);
1752+
return Builder.CreateShuffleVector(FrontV->get(), Mask);
1753+
}
1754+
if (ConcatLeafs.contains(FrontV)) {
1755+
unsigned NumElts =
1756+
cast<FixedVectorType>(FrontV->get()->getType())->getNumElements();
1757+
SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
1758+
for (unsigned S = 0; S < Values.size(); ++S)
1759+
Values[S] = Item[S * NumElts].first->get();
1760+
1761+
while (Values.size() > 1) {
1762+
NumElts *= 2;
1763+
SmallVector<int, 16> Mask(NumElts, 0);
1764+
std::iota(Mask.begin(), Mask.end(), 0);
1765+
SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
1766+
for (unsigned S = 0; S < NewValues.size(); ++S)
1767+
NewValues[S] =
1768+
Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
1769+
Values = NewValues;
1770+
}
1771+
return Values[0];
17261772
}
17271773

1728-
auto *I = cast<Instruction>(FrontV);
1774+
auto *I = cast<Instruction>(FrontV->get());
17291775
auto *II = dyn_cast<IntrinsicInst>(I);
17301776
unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
17311777
SmallVector<Value *> Ops(NumOps);
@@ -1734,16 +1780,16 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
17341780
Ops[Idx] = II->getOperand(Idx);
17351781
continue;
17361782
}
1737-
Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
1738-
Ty, IdentityLeafs, SplatLeafs, Builder);
1783+
Ops[Idx] =
1784+
generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), Ty,
1785+
IdentityLeafs, SplatLeafs, ConcatLeafs, Builder);
17391786
}
17401787

17411788
SmallVector<Value *, 8> ValueList;
17421789
for (const auto &Lane : Item)
17431790
if (Lane.first)
1744-
ValueList.push_back(Lane.first);
1791+
ValueList.push_back(Lane.first->get());
17451792

1746-
Builder.SetInsertPoint(I);
17471793
Type *DstTy =
17481794
FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
17491795
if (auto *BI = dyn_cast<BinaryOperator>(I)) {
@@ -1785,16 +1831,16 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
17851831
// do so.
17861832
bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
17871833
auto *Ty = dyn_cast<FixedVectorType>(I.getType());
1788-
if (!Ty)
1834+
if (!Ty || I.use_empty())
17891835
return false;
17901836

17911837
SmallVector<InstLane> Start(Ty->getNumElements());
17921838
for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
1793-
Start[M] = lookThroughShuffles(&I, M);
1839+
Start[M] = lookThroughShuffles(&*I.use_begin(), M);
17941840

17951841
SmallVector<SmallVector<InstLane>> Worklist;
17961842
Worklist.push_back(Start);
1797-
SmallPtrSet<Value *, 4> IdentityLeafs, SplatLeafs;
1843+
SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
17981844
unsigned NumVisited = 0;
17991845

18001846
while (!Worklist.empty()) {
@@ -1809,12 +1855,12 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
18091855
return false;
18101856

18111857
// Look for an identity value.
1812-
if (!FrontLane &&
1813-
cast<FixedVectorType>(FrontV->getType())->getNumElements() ==
1858+
if (FrontLane == 0 &&
1859+
cast<FixedVectorType>(FrontV->get()->getType())->getNumElements() ==
18141860
Ty->getNumElements() &&
18151861
all_of(drop_begin(enumerate(Item)), [Item](const auto &E) {
1816-
Value *FrontV = Item.front().first;
1817-
return !E.value().first || (E.value().first == FrontV &&
1862+
Value *FrontV = Item.front().first->get();
1863+
return !E.value().first || (E.value().first->get() == FrontV &&
18181864
E.value().second == (int)E.index());
18191865
})) {
18201866
IdentityLeafs.insert(FrontV);
@@ -1824,9 +1870,9 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
18241870
if (auto *C = dyn_cast<Constant>(FrontV);
18251871
C && C->getSplatValue() &&
18261872
all_of(drop_begin(Item), [Item](InstLane &IL) {
1827-
Value *FrontV = Item.front().first;
1828-
Value *V = IL.first;
1829-
return !V || V == FrontV;
1873+
Value *FrontV = Item.front().first->get();
1874+
Use *V = IL.first;
1875+
return !V || V->get() == FrontV;
18301876
})) {
18311877
SplatLeafs.insert(FrontV);
18321878
continue;
@@ -1835,19 +1881,19 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
18351881
if (all_of(drop_begin(Item), [Item](InstLane &IL) {
18361882
auto [FrontV, FrontLane] = Item.front();
18371883
auto [V, Lane] = IL;
1838-
return !V || (V == FrontV && Lane == FrontLane);
1884+
return !V || (V->get() == FrontV->get() && Lane == FrontLane);
18391885
})) {
18401886
SplatLeafs.insert(FrontV);
18411887
continue;
18421888
}
18431889

18441890
// We need each element to be the same type of value, and check that each
18451891
// element has a single use.
1846-
if (!all_of(drop_begin(Item), [Item](InstLane IL) {
1847-
Value *FrontV = Item.front().first;
1848-
Value *V = IL.first;
1849-
if (!V)
1892+
if (all_of(drop_begin(Item), [Item](InstLane IL) {
1893+
Value *FrontV = Item.front().first->get();
1894+
if (!IL.first)
18501895
return true;
1896+
Value *V = IL.first->get();
18511897
if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUse())
18521898
return false;
18531899
if (V->getValueID() != FrontV->getValueID())
@@ -1864,48 +1910,59 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
18641910
return !II || (isa<IntrinsicInst>(FrontV) &&
18651911
II->getIntrinsicID() ==
18661912
cast<IntrinsicInst>(FrontV)->getIntrinsicID());
1867-
}))
1868-
return false;
1869-
1870-
// Check the operator is one that we support. We exclude div/rem in case
1871-
// they hit UB from poison lanes.
1872-
if ((isa<BinaryOperator>(FrontV) &&
1873-
!cast<BinaryOperator>(FrontV)->isIntDivRem()) ||
1874-
isa<CmpInst>(FrontV)) {
1875-
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
1876-
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
1877-
} else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontV)) {
1878-
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
1879-
} else if (isa<SelectInst>(FrontV)) {
1880-
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
1881-
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
1882-
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
1883-
} else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
1884-
II && isTriviallyVectorizable(II->getIntrinsicID())) {
1885-
for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
1886-
if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) {
1887-
if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
1888-
Value *FrontV = Item.front().first;
1889-
Value *V = IL.first;
1890-
return !V || (cast<Instruction>(V)->getOperand(Op) ==
1891-
cast<Instruction>(FrontV)->getOperand(Op));
1892-
}))
1893-
return false;
1894-
continue;
1913+
})) {
1914+
// Check the operator is one that we support.
1915+
if (isa<BinaryOperator, CmpInst>(FrontV)) {
1916+
// We exclude div/rem in case they hit UB from poison lanes.
1917+
if (auto *BO = dyn_cast<BinaryOperator>(FrontV);
1918+
BO && BO->isIntDivRem())
1919+
return false;
1920+
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
1921+
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
1922+
continue;
1923+
} else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontV)) {
1924+
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
1925+
continue;
1926+
} else if (isa<SelectInst>(FrontV)) {
1927+
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
1928+
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
1929+
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
1930+
continue;
1931+
} else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
1932+
II && isTriviallyVectorizable(II->getIntrinsicID())) {
1933+
for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
1934+
if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) {
1935+
if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
1936+
Value *FrontV = Item.front().first->get();
1937+
Value *V = IL.first->get();
1938+
return !V || (cast<Instruction>(V)->getOperand(Op) ==
1939+
cast<Instruction>(FrontV)->getOperand(Op));
1940+
}))
1941+
return false;
1942+
continue;
1943+
}
1944+
Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
18951945
}
1896-
Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
1946+
continue;
18971947
}
1898-
} else {
1899-
return false;
19001948
}
1949+
1950+
if (isFreeConcat(Item, TTI)) {
1951+
ConcatLeafs.insert(FrontV);
1952+
continue;
1953+
}
1954+
1955+
return false;
19011956
}
19021957

19031958
if (NumVisited <= 1)
19041959
return false;
19051960

19061961
// If we got this far, we know the shuffles are superfluous and can be
19071962
// removed. Scan through again and generate the new tree of instructions.
1908-
Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs, Builder);
1963+
Builder.SetInsertPoint(&I);
1964+
Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
1965+
ConcatLeafs, Builder);
19091966
replaceValue(I, *V);
19101967
return true;
19111968
}

llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ define void @add4(ptr noalias noundef %x, ptr noalias noundef %y, i32 noundef %n
2222
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
2323
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
2424
; CHECK-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
25-
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
2625
; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
2726
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP2]]
27+
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
2828
; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
2929
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
3030
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -403,12 +403,12 @@ define void @addmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z, i32
403403
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
404404
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Z:%.*]], i64 [[OFFSET_IDX]]
405405
; CHECK-NEXT: [[WIDE_VEC31:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
406-
; CHECK-NEXT: [[TMP2:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
407-
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
408-
; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
409-
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
410-
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
411-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP4]]
406+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
407+
; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP2]], align 2
408+
; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
409+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP3]]
410+
; CHECK-NEXT: [[TMP4:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
411+
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[TMP4]], [[WIDE_VEC36]]
412412
; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
413413
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
414414
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256

0 commit comments

Comments
 (0)