Skip to content

Commit d6fde91

Browse files
committed
[SLP]Add detection of shuffled/perfect matching of tree entries.
SLP supports perfect diamond matching for the vectorized tree entries, but does not support it for gathered entries, and does not support non-perfect (shuffled) matching with 1 or 2 tree entries. This patch adds support for such matching to improve the cost of the vectorized tree. Differential Revision: https://reviews.llvm.org/D100495
1 parent ca8eef7 commit d6fde91

File tree

3 files changed

+138
-115
lines changed

3 files changed

+138
-115
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1542,6 +1542,14 @@ class BoUpSLP {
15421542
getGatherCost(FixedVectorType *Ty,
15431543
const DenseSet<unsigned> &ShuffledIndices) const;
15441544

1545+
/// Checks if the gathered \p VL can be represented as shuffle(s) of previous
1546+
/// tree entries.
1547+
/// \returns ShuffleKind, if gathered values can be represented as shuffles of
1548+
/// previous tree entries. \p Mask is filled with the shuffle mask.
1549+
Optional<TargetTransformInfo::ShuffleKind>
1550+
isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
1551+
SmallVectorImpl<const TreeEntry *> &Entries);
1552+
15451553
/// \returns the scalarization cost for this list of values. Assuming that
15461554
/// this subtree gets vectorized, we may need to extract the values from the
15471555
/// roots. This method calculates the cost of extracting the values.
@@ -3560,7 +3568,27 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
35603568
return ReuseShuffleCost + Cost;
35613569
}
35623570
}
3563-
return ReuseShuffleCost + getGatherCost(VL);
3571+
InstructionCost GatherCost = 0;
3572+
SmallVector<int> Mask;
3573+
SmallVector<const TreeEntry *> Entries;
3574+
Optional<TargetTransformInfo::ShuffleKind> Shuffle =
3575+
isGatherShuffledEntry(E, Mask, Entries);
3576+
if (Shuffle.hasValue()) {
3577+
if (ShuffleVectorInst::isIdentityMask(Mask)) {
3578+
LLVM_DEBUG(
3579+
dbgs()
3580+
<< "SLP: perfect diamond match for gather bundle that starts with "
3581+
<< *VL.front() << ".\n");
3582+
} else {
3583+
LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
3584+
<< " entries for bundle that starts with "
3585+
<< *VL.front() << ".\n");
3586+
GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
3587+
}
3588+
} else {
3589+
GatherCost = getGatherCost(VL);
3590+
}
3591+
return ReuseShuffleCost + GatherCost;
35643592
}
35653593
assert((E->State == TreeEntry::Vectorize ||
35663594
E->State == TreeEntry::ScatterVectorize) &&
@@ -4216,6 +4244,61 @@ InstructionCost BoUpSLP::getTreeCost() {
42164244
return Cost;
42174245
}
42184246

4247+
Optional<TargetTransformInfo::ShuffleKind>
4248+
BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
4249+
SmallVectorImpl<const TreeEntry *> &Entries) {
4250+
auto *VLIt = find_if(VectorizableTree,
4251+
[TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
4252+
return EntryPtr.get() == TE;
4253+
});
4254+
assert(VLIt != VectorizableTree.end() &&
4255+
"Gathered values should be in the tree.");
4256+
Mask.clear();
4257+
Entries.clear();
4258+
DenseMap<const TreeEntry *, int> Used;
4259+
int NumShuffles = 0;
4260+
for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
4261+
Value *V = TE->Scalars[I];
4262+
const TreeEntry *VTE = getTreeEntry(V);
4263+
if (!VTE) {
4264+
// Check if it is used in one of the gathered entries.
4265+
const auto *It =
4266+
find_if(make_range(VectorizableTree.begin(), VLIt),
4267+
[V](const std::unique_ptr<TreeEntry> &EntryPtr) {
4268+
return EntryPtr->State == TreeEntry::NeedToGather &&
4269+
is_contained(EntryPtr->Scalars, V);
4270+
});
4271+
if (It != VLIt)
4272+
VTE = It->get();
4273+
}
4274+
if (VTE) {
4275+
auto Res = Used.try_emplace(VTE, NumShuffles);
4276+
if (Res.second) {
4277+
Entries.push_back(VTE);
4278+
++NumShuffles;
4279+
}
4280+
Mask.push_back(
4281+
Res.first->second * E +
4282+
std::distance(VTE->Scalars.begin(), find(VTE->Scalars, V)));
4283+
continue;
4284+
}
4285+
return None;
4286+
}
4287+
if (NumShuffles == 1) {
4288+
if (ShuffleVectorInst::isReverseMask(Mask))
4289+
return TargetTransformInfo::SK_Reverse;
4290+
return TargetTransformInfo::SK_PermuteSingleSrc;
4291+
}
4292+
if (NumShuffles == 2) {
4293+
if (ShuffleVectorInst::isSelectMask(Mask))
4294+
return TargetTransformInfo::SK_Select;
4295+
if (ShuffleVectorInst::isTransposeMask(Mask))
4296+
return TargetTransformInfo::SK_Transpose;
4297+
return TargetTransformInfo::SK_PermuteTwoSrc;
4298+
}
4299+
return None;
4300+
}
4301+
42194302
InstructionCost
42204303
BoUpSLP::getGatherCost(FixedVectorType *Ty,
42214304
const DenseSet<unsigned> &ShuffledIndices) const {
@@ -4499,7 +4582,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
44994582
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
45004583
if (E->State == TreeEntry::NeedToGather) {
45014584
setInsertPointAfterBundle(E);
4502-
Value *Vec = gather(E->Scalars);
4585+
Value *Vec;
4586+
SmallVector<int> Mask;
4587+
SmallVector<const TreeEntry *> Entries;
4588+
Optional<TargetTransformInfo::ShuffleKind> Shuffle =
4589+
isGatherShuffledEntry(E, Mask, Entries);
4590+
if (Shuffle.hasValue()) {
4591+
assert((Entries.size() == 1 || Entries.size() == 2) &&
4592+
"Expected shuffle of 1 or 2 entries.");
4593+
Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
4594+
Entries.back()->VectorizedValue, Mask);
4595+
} else {
4596+
Vec = gather(E->Scalars);
4597+
}
45034598
if (NeedToShuffleReuses) {
45044599
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
45054600
Vec = ShuffleBuilder.finalize(Vec);

llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"
1010
; REMARK-LABEL: Function: gather_multiple_use
1111
; REMARK: Args:
1212
; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost '
13-
; REMARK-NEXT: - Cost: '-7'
13+
; REMARK-NEXT: - Cost: '-16'
1414
;
1515
; REMARK-NOT: Function: gather_load
1616

llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll

Lines changed: 40 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -4,124 +4,52 @@
44
define i32 @bar() local_unnamed_addr {
55
; CHECK-LABEL: @bar(
66
; CHECK-NEXT: entry:
7-
; CHECK-NEXT: [[ADD103:%.*]] = add nsw i32 undef, undef
8-
; CHECK-NEXT: [[SUB104:%.*]] = sub nsw i32 undef, undef
9-
; CHECK-NEXT: [[ADD105:%.*]] = add nsw i32 undef, undef
10-
; CHECK-NEXT: [[SUB106:%.*]] = sub nsw i32 undef, undef
11-
; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15
12-
; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
13-
; CHECK-NEXT: [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535
14-
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
15-
; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]]
16-
; CHECK-NEXT: [[SHR_I64:%.*]] = lshr i32 [[ADD105]], 15
17-
; CHECK-NEXT: [[AND_I65:%.*]] = and i32 [[SHR_I64]], 65537
18-
; CHECK-NEXT: [[MUL_I66:%.*]] = mul nuw i32 [[AND_I65]], 65535
19-
; CHECK-NEXT: [[ADD_I67:%.*]] = add i32 [[MUL_I66]], [[ADD105]]
20-
; CHECK-NEXT: [[XOR_I68:%.*]] = xor i32 [[ADD_I67]], [[MUL_I66]]
21-
; CHECK-NEXT: [[SHR_I69:%.*]] = lshr i32 [[SUB104]], 15
22-
; CHECK-NEXT: [[AND_I70:%.*]] = and i32 [[SHR_I69]], 65537
23-
; CHECK-NEXT: [[MUL_I71:%.*]] = mul nuw i32 [[AND_I70]], 65535
24-
; CHECK-NEXT: [[ADD_I72:%.*]] = add i32 [[MUL_I71]], [[SUB104]]
25-
; CHECK-NEXT: [[XOR_I73:%.*]] = xor i32 [[ADD_I72]], [[MUL_I71]]
26-
; CHECK-NEXT: [[SHR_I74:%.*]] = lshr i32 [[SUB106]], 15
27-
; CHECK-NEXT: [[AND_I75:%.*]] = and i32 [[SHR_I74]], 65537
28-
; CHECK-NEXT: [[MUL_I76:%.*]] = mul nuw i32 [[AND_I75]], 65535
29-
; CHECK-NEXT: [[ADD_I77:%.*]] = add i32 [[MUL_I76]], [[SUB106]]
30-
; CHECK-NEXT: [[XOR_I78:%.*]] = xor i32 [[ADD_I77]], [[MUL_I76]]
31-
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I68]], [[XOR_I]]
32-
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I73]]
33-
; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I78]]
347
; CHECK-NEXT: [[ADD78_1:%.*]] = add nsw i32 undef, undef
358
; CHECK-NEXT: [[SUB86_1:%.*]] = sub nsw i32 undef, undef
369
; CHECK-NEXT: [[ADD94_1:%.*]] = add nsw i32 undef, undef
3710
; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef
38-
; CHECK-NEXT: [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]]
39-
; CHECK-NEXT: [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]]
40-
; CHECK-NEXT: [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]]
41-
; CHECK-NEXT: [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]]
42-
; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15
43-
; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
44-
; CHECK-NEXT: [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535
45-
; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
46-
; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]]
47-
; CHECK-NEXT: [[SHR_I64_1:%.*]] = lshr i32 [[ADD105_1]], 15
48-
; CHECK-NEXT: [[AND_I65_1:%.*]] = and i32 [[SHR_I64_1]], 65537
49-
; CHECK-NEXT: [[MUL_I66_1:%.*]] = mul nuw i32 [[AND_I65_1]], 65535
50-
; CHECK-NEXT: [[ADD_I67_1:%.*]] = add i32 [[MUL_I66_1]], [[ADD105_1]]
51-
; CHECK-NEXT: [[XOR_I68_1:%.*]] = xor i32 [[ADD_I67_1]], [[MUL_I66_1]]
52-
; CHECK-NEXT: [[SHR_I69_1:%.*]] = lshr i32 [[SUB104_1]], 15
53-
; CHECK-NEXT: [[AND_I70_1:%.*]] = and i32 [[SHR_I69_1]], 65537
54-
; CHECK-NEXT: [[MUL_I71_1:%.*]] = mul nuw i32 [[AND_I70_1]], 65535
55-
; CHECK-NEXT: [[ADD_I72_1:%.*]] = add i32 [[MUL_I71_1]], [[SUB104_1]]
56-
; CHECK-NEXT: [[XOR_I73_1:%.*]] = xor i32 [[ADD_I72_1]], [[MUL_I71_1]]
57-
; CHECK-NEXT: [[SHR_I74_1:%.*]] = lshr i32 [[SUB106_1]], 15
58-
; CHECK-NEXT: [[AND_I75_1:%.*]] = and i32 [[SHR_I74_1]], 65537
59-
; CHECK-NEXT: [[MUL_I76_1:%.*]] = mul nuw i32 [[AND_I75_1]], 65535
60-
; CHECK-NEXT: [[ADD_I77_1:%.*]] = add i32 [[MUL_I76_1]], [[SUB106_1]]
61-
; CHECK-NEXT: [[XOR_I78_1:%.*]] = xor i32 [[ADD_I77_1]], [[MUL_I76_1]]
62-
; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I68_1]], [[ADD113]]
63-
; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
64-
; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I73_1]]
65-
; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I78_1]]
6611
; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef
67-
; CHECK-NEXT: [[ADD103_2:%.*]] = add nsw i32 undef, [[ADD78_2]]
68-
; CHECK-NEXT: [[SUB104_2:%.*]] = sub nsw i32 [[ADD78_2]], undef
69-
; CHECK-NEXT: [[ADD105_2:%.*]] = add nsw i32 undef, undef
70-
; CHECK-NEXT: [[SUB106_2:%.*]] = sub nsw i32 undef, undef
71-
; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15
72-
; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
73-
; CHECK-NEXT: [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535
74-
; CHECK-NEXT: [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]]
75-
; CHECK-NEXT: [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]]
76-
; CHECK-NEXT: [[SHR_I64_2:%.*]] = lshr i32 [[ADD105_2]], 15
77-
; CHECK-NEXT: [[AND_I65_2:%.*]] = and i32 [[SHR_I64_2]], 65537
78-
; CHECK-NEXT: [[MUL_I66_2:%.*]] = mul nuw i32 [[AND_I65_2]], 65535
79-
; CHECK-NEXT: [[ADD_I67_2:%.*]] = add i32 [[MUL_I66_2]], [[ADD105_2]]
80-
; CHECK-NEXT: [[XOR_I68_2:%.*]] = xor i32 [[ADD_I67_2]], [[MUL_I66_2]]
81-
; CHECK-NEXT: [[SHR_I69_2:%.*]] = lshr i32 [[SUB104_2]], 15
82-
; CHECK-NEXT: [[AND_I70_2:%.*]] = and i32 [[SHR_I69_2]], 65537
83-
; CHECK-NEXT: [[MUL_I71_2:%.*]] = mul nuw i32 [[AND_I70_2]], 65535
84-
; CHECK-NEXT: [[ADD_I72_2:%.*]] = add i32 [[MUL_I71_2]], [[SUB104_2]]
85-
; CHECK-NEXT: [[XOR_I73_2:%.*]] = xor i32 [[ADD_I72_2]], [[MUL_I71_2]]
86-
; CHECK-NEXT: [[SHR_I74_2:%.*]] = lshr i32 [[SUB106_2]], 15
87-
; CHECK-NEXT: [[AND_I75_2:%.*]] = and i32 [[SHR_I74_2]], 65537
88-
; CHECK-NEXT: [[MUL_I76_2:%.*]] = mul nuw i32 [[AND_I75_2]], 65535
89-
; CHECK-NEXT: [[ADD_I77_2:%.*]] = add i32 [[MUL_I76_2]], [[SUB106_2]]
90-
; CHECK-NEXT: [[XOR_I78_2:%.*]] = xor i32 [[ADD_I77_2]], [[MUL_I76_2]]
91-
; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I68_2]], [[ADD113_1]]
92-
; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]]
93-
; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I73_2]]
94-
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I78_2]]
9512
; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef
96-
; CHECK-NEXT: [[ADD103_3:%.*]] = add nsw i32 undef, undef
97-
; CHECK-NEXT: [[SUB104_3:%.*]] = sub nsw i32 undef, undef
98-
; CHECK-NEXT: [[ADD105_3:%.*]] = add nsw i32 [[SUB102_3]], undef
99-
; CHECK-NEXT: [[SUB106_3:%.*]] = sub nsw i32 undef, [[SUB102_3]]
100-
; CHECK-NEXT: [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15
101-
; CHECK-NEXT: [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537
102-
; CHECK-NEXT: [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535
103-
; CHECK-NEXT: [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]]
104-
; CHECK-NEXT: [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]]
105-
; CHECK-NEXT: [[SHR_I64_3:%.*]] = lshr i32 [[ADD105_3]], 15
106-
; CHECK-NEXT: [[AND_I65_3:%.*]] = and i32 [[SHR_I64_3]], 65537
107-
; CHECK-NEXT: [[MUL_I66_3:%.*]] = mul nuw i32 [[AND_I65_3]], 65535
108-
; CHECK-NEXT: [[ADD_I67_3:%.*]] = add i32 [[MUL_I66_3]], [[ADD105_3]]
109-
; CHECK-NEXT: [[XOR_I68_3:%.*]] = xor i32 [[ADD_I67_3]], [[MUL_I66_3]]
110-
; CHECK-NEXT: [[SHR_I69_3:%.*]] = lshr i32 [[SUB104_3]], 15
111-
; CHECK-NEXT: [[AND_I70_3:%.*]] = and i32 [[SHR_I69_3]], 65537
112-
; CHECK-NEXT: [[MUL_I71_3:%.*]] = mul nuw i32 [[AND_I70_3]], 65535
113-
; CHECK-NEXT: [[ADD_I72_3:%.*]] = add i32 [[MUL_I71_3]], [[SUB104_3]]
114-
; CHECK-NEXT: [[XOR_I73_3:%.*]] = xor i32 [[ADD_I72_3]], [[MUL_I71_3]]
115-
; CHECK-NEXT: [[SHR_I74_3:%.*]] = lshr i32 [[SUB106_3]], 15
116-
; CHECK-NEXT: [[AND_I75_3:%.*]] = and i32 [[SHR_I74_3]], 65537
117-
; CHECK-NEXT: [[MUL_I76_3:%.*]] = mul nuw i32 [[AND_I75_3]], 65535
118-
; CHECK-NEXT: [[ADD_I77_3:%.*]] = add i32 [[MUL_I76_3]], [[SUB106_3]]
119-
; CHECK-NEXT: [[XOR_I78_3:%.*]] = xor i32 [[ADD_I77_3]], [[MUL_I76_3]]
120-
; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I68_3]], [[ADD113_2]]
121-
; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]]
122-
; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I73_3]]
123-
; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I78_3]]
124-
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16
13+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_3]], i32 0
14+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 undef, i32 1
15+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[SUB102_1]], i32 2
16+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 undef, i32 3
17+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 undef, i32 4
18+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 undef, i32 5
19+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 undef, i32 6
20+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 7
21+
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[ADD78_1]], i32 8
22+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB86_1]], i32 9
23+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 undef, i32 10
24+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[ADD78_2]], i32 11
25+
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 undef, i32 12
26+
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 undef, i32 13
27+
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 undef, i32 14
28+
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 undef, i32 15
29+
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[SUB86_1]], i32 2
30+
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 undef, i32 3
31+
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 undef, i32 4
32+
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 undef, i32 5
33+
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 undef, i32 6
34+
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[ADD78_1]], i32 7
35+
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[ADD94_1]], i32 8
36+
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[SUB102_1]], i32 9
37+
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[ADD78_2]], i32 10
38+
; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 undef, i32 11
39+
; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 undef, i32 12
40+
; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 undef, i32 13
41+
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 undef, i32 14
42+
; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[SUB102_3]], i32 15
43+
; CHECK-NEXT: [[TMP30:%.*]] = add nsw <16 x i32> [[TMP15]], [[TMP29]]
44+
; CHECK-NEXT: [[TMP31:%.*]] = sub nsw <16 x i32> [[TMP15]], [[TMP29]]
45+
; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i32> [[TMP30]], <16 x i32> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 7, i32 24, i32 25, i32 10, i32 27, i32 28, i32 13, i32 30, i32 31>
46+
; CHECK-NEXT: [[TMP33:%.*]] = lshr <16 x i32> [[TMP32]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
47+
; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i32> [[TMP33]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
48+
; CHECK-NEXT: [[TMP35:%.*]] = mul nuw <16 x i32> [[TMP34]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
49+
; CHECK-NEXT: [[TMP36:%.*]] = add <16 x i32> [[TMP35]], [[TMP32]]
50+
; CHECK-NEXT: [[TMP37:%.*]] = xor <16 x i32> [[TMP36]], [[TMP35]]
51+
; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP37]])
52+
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP38]], 16
12553
; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]]
12654
; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
12755
; CHECK-NEXT: ret i32 [[SHR120]]

0 commit comments

Comments
 (0)