Commit 72a0396
[SLP] Vectorize non-power-of-2 ops with padding.
This patch introduces a new VectorizeWithPadding node type for root and leaf nodes, to allow vectorizing loads/stores with a non-power-of-2 number of elements. VectorizeWithPadding load nodes pad their result to the next power of 2 with poison elements. Non-leaf nodes operate on normal power-of-2 vectors; for those nodes we still track the number of padding elements needed to reach the next power of 2, for use in various places such as cost computation. VectorizeWithPadding store nodes strip away the padding elements and store the non-power-of-2 number of data elements.

Note that re-ordering and shuffling are not yet implemented for nodes requiring padding, to keep the initial implementation simpler. The initial implementation also only tries to vectorize with padding if the original number of elements + 1 is a power of 2, i.e. if only a single padding element is needed.

The feature is guarded by a new flag, off by default for now.
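To make the padding idea concrete, here is a minimal hand-written IR sketch. This is not code emitted by the patch (the test diff below shows the actual output, which stores a <15 x i8> directly); it only illustrates the bookkeeping described above for a 15-element bundle, with the feature enabled via the new -slp-vectorize-non-power-of-2 flag: the load side widens 15 data lanes to the next power of 2 with a poison lane, non-leaf nodes work at the power-of-2 width, and the store side strips the padding lane again.

; Conceptual sketch only; the function name is hypothetical.
define void @pad_v15_sketch(ptr %src, ptr noalias %dst) {
entry:
  ; VectorizeWithPadding load: 15 data elements, padded to 16 lanes,
  ; with the extra lane holding poison.
  %data = load <15 x i8>, ptr %src, align 1
  %padded = shufflevector <15 x i8> %data, <15 x i8> poison,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14,
                  i32 poison>
  ; Non-leaf nodes operate on the normal power-of-2 type.
  %mul = mul nsw <16 x i8> %padded,
      <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10,
       i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
  ; VectorizeWithPadding store: strip the padding lane and store the
  ; 15 data elements.
  %stripped = shufflevector <16 x i8> %mul, <16 x i8> poison,
      <15 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
  store <15 x i8> %stripped, ptr %dst, align 1
  ret void
}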
1 parent 5330daa

File tree

7 files changed: +686 -350 lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 99 additions & 16 deletions

@@ -179,6 +179,10 @@ static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));

+static cl::opt<bool> VectorizeNonPowerOf2(
+    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
+    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;

@@ -2733,6 +2737,9 @@ class BoUpSLP {
                   SmallVectorImpl<Value *> *OpScalars = nullptr,
                   SmallVectorImpl<Value *> *AltScalars = nullptr) const;

+    /// Return true if this node has a non-power-of-2 number of scalars.
+    bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {

@@ -2891,9 +2898,13 @@ class BoUpSLP {
       ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
     }

-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-
+      if (!isPowerOf2_32(Last->Scalars.size())) {
+        assert(Last->ReorderIndices.empty() &&
+               "Reordering isn't implemented for nodes with padding yet");
+      }
+    }
     return Last;
   }

@@ -2921,7 +2932,8 @@ class BoUpSLP {
   /// and fills required data before actual scheduling of the instructions.
   TreeEntry::EntryState getScalarsVectorizationState(
       InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
-      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
+      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+      bool HasPadding) const;

   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;

@@ -3881,6 +3893,9 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   Order.clear();
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
+  if (!Order.empty() && !isPowerOf2_32(VL.size()))
+    return LoadsState::Gather;
+
   if (IsSorted || all_of(PointerOps, [&](Value *P) {
         return arePointersCompatible(P, PointerOps.front(), TLI);
       })) {
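For illustration, here is a hypothetical input (hand-written, not one of the patch's tests) that hits the new early-out above: a 3-element reverse copy. sortPtrAccesses finds the load pointers in non-identity order (2, 1, 0), so Order is non-empty, and since VL.size() == 3 is not a power of 2, the bundle falls back to LoadsState::Gather rather than the padding path.

define void @reverse_v3(ptr %src, ptr noalias %dst) {
entry:
  ; dst[i] = src[2 - i]: consecutive loads, visited in reversed order.
  %gep.src.1 = getelementptr inbounds i8, ptr %src, i64 1
  %gep.src.2 = getelementptr inbounds i8, ptr %src, i64 2
  %l2 = load i8, ptr %gep.src.2, align 1
  %l1 = load i8, ptr %gep.src.1, align 1
  %l0 = load i8, ptr %src, align 1
  %gep.dst.1 = getelementptr i8, ptr %dst, i64 1
  %gep.dst.2 = getelementptr i8, ptr %dst, i64 2
  store i8 %l2, ptr %dst, align 1
  store i8 %l1, ptr %gep.dst.1, align 1
  store i8 %l0, ptr %gep.dst.2, align 1
  ret void
}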
@@ -4568,6 +4583,10 @@ bool BoUpSLP::canReorderOperands(
     TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
+  // Reordering isn't implemented for nodes with padding yet.
+  if (UserTE->isNonPowOf2Vec())
+    return false;
+
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
     if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
           return OpData.first == I &&

@@ -4746,6 +4765,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
     const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                         const TreeEntry *TE) {
+      // Reordering for nodes with padding is not implemented yet.
+      if (TE->isNonPowOf2Vec())
+        return false;
       if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
           (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
           (IgnoreReorder && TE->Idx == 0))

@@ -5233,7 +5255,8 @@ static bool isAlternateInstruction(const Instruction *I,

 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
-    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
+    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+    bool HasPadding) const {
   assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");

   unsigned ShuffleOrOp =

@@ -5256,7 +5279,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
-    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+    bool Reuse = !HasPadding && canReuseExtract(VL, VL0, CurrentOrder);
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
     LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");

@@ -5583,6 +5606,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (PWSz == VL.size()) {
         ReuseShuffleIndicies.clear();
       } else {
+        if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+          LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
+                               "for nodes with padding.\n");
+          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+          return false;
+        }
         NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
         NonUniqueValueVL.append(PWSz - UniqueValues.size(),
                                 UniqueValues.back());

@@ -5594,6 +5623,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
       return false;
     }
+    if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported for "
+                           "nodes with padding.\n");
+      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+      return false;
+    }
     VL = UniqueValues;
   }
   return true;

@@ -5859,7 +5894,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   OrdersType CurrentOrder;
   SmallVector<Value *> PointerOps;
   TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps,
+      UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec());
   if (State == TreeEntry::NeedToGather) {
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndicies);

@@ -6955,7 +6991,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     return Constant::getAllOnesValue(Ty);
   }

-  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root,
+                                     bool WithPadding = false) {
     if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
       return TTI::TCC_Free;
     auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());

@@ -6966,7 +7003,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     InstructionsState S = getSameOpcode(VL, *R.TLI);
     const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
     unsigned MinVF = R.getMinVF(2 * Sz);
-    if (VL.size() > 2 &&
+    if (!WithPadding && VL.size() > 2 &&
         ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
          (InVectors.empty() &&
           any_of(seq<unsigned>(0, VL.size() / MinVF),

@@ -7077,7 +7114,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         }
         GatherCost -= ScalarsCost;
       }
-    } else if (!Root && isSplat(VL)) {
+    } else if (!WithPadding && !Root && isSplat(VL)) {
       // Found the broadcasting of the single scalar, calculate the cost as
       // the broadcast.
       const auto *It =

@@ -7640,8 +7677,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         CommonMask[Idx] = Mask[Idx] + VF;
   }
   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
-                Value *Root = nullptr) {
-    Cost += getBuildVectorCost(VL, Root);
+                Value *Root = nullptr, bool WithPadding = false) {
+    Cost += getBuildVectorCost(VL, Root, WithPadding);
     if (!Root) {
       // FIXME: Need to find a way to avoid use of getNullValue here.
       SmallVector<Constant *> Vals;

@@ -9710,6 +9747,9 @@ BoUpSLP::isGatherShuffledEntry(
   // No need to check for the topmost gather node.
   if (TE == VectorizableTree.front().get())
     return {};
+  // Gathering for nodes with padding is not implemented yet.
+  if (TE->isNonPowOf2Vec())
+    return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");

@@ -10422,7 +10462,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     add(V1, NewMask);
   }
   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
-                Value *Root = nullptr) {
+                Value *Root = nullptr, bool WithPadding = false) {
     return R.gather(VL, Root);
   }
   Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }

@@ -10491,7 +10531,6 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
     SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
     reorderScalars(VL, Mask);
   }
-  const unsigned VF = VL.size();
   InstructionsState S = getSameOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
   if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {

@@ -10533,6 +10572,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
       ShuffleBuilder.add(V, Mask);
       return ShuffleBuilder.finalize(std::nullopt);
     };
+    const unsigned VF = VL.size();
     Value *V = vectorizeTree(VE, PostponedPHIs);
     if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
       if (!VE->ReuseShuffleIndices.empty()) {

@@ -10659,6 +10699,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     return true;
   };
   BVTy ShuffleBuilder(Params...);
+  if (E->isNonPowOf2Vec()) {
+    Value *BV = ShuffleBuilder.gather(E->Scalars, 0, nullptr, true);
+    SmallVector<int> Mask(VF, PoisonMaskElem);
+    std::iota(Mask.begin(), Mask.begin() + E->Scalars.size(), 0);
+    ShuffleBuilder.add(BV, Mask);
+    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+  }
+
   ResTy Res = ResTy();
   SmallVector<int> Mask;
   SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
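The iota mask built in the new processBuildVector path selects the node's scalars into the low lanes and leaves the remaining lanes at PoisonMaskElem. As a hand-written sketch of the resulting IR shape (a hypothetical 3-scalar node padded to a VF of 4; not output from the patch):

define <4 x i32> @gather_pad_sketch(i32 %a, i32 %b, i32 %c) {
entry:
  ; gather() builds the non-power-of-2 vector ...
  %bv.0 = insertelement <3 x i32> poison, i32 %a, i32 0
  %bv.1 = insertelement <3 x i32> %bv.0, i32 %b, i32 1
  %bv.2 = insertelement <3 x i32> %bv.1, i32 %c, i32 2
  ; ... and the iota mask <0, 1, 2, poison> widens it to the padded VF.
  %pad = shufflevector <3 x i32> %bv.2, <3 x i32> poison,
      <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
  ret <4 x i32> %pad
}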
@@ -13422,8 +13470,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
   unsigned VF = Chain.size();

-  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
-    return false;
+  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
+    // Check if vectorizing with a non-power-of-2 VF should be considered. At
+    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
+    // almost all vector lanes are used.
+    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
+      return false;
+  }

   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                     << "\n");
@@ -13519,9 +13572,39 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                       << "MinVF (" << MinVF << ")\n");
   }

+  unsigned StartIdx = 0;
+  if (VectorizeNonPowerOf2) {
+    // Try vectorizing with a non-power-of-2 VF. At the moment, only
+    // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
+    // lanes are used.
+    unsigned CandVF = Operands.size() + 1;
+    if (isPowerOf2_32(CandVF) && CandVF <= MaxVF) {
+      assert(
+          all_of(
+              Operands,
+              [&](Value *V) {
+                return cast<StoreInst>(V)->getValueOperand()->getType() ==
+                       cast<StoreInst>(Operands.front())
+                           ->getValueOperand()
+                           ->getType();
+              }) &&
+          "Expected all operands of same type.");
+      if (!VectorizedStores.count(Operands.front()) &&
+          !VectorizedStores.count(Operands.back()) &&
+          TriedSequences
+              .insert(std::make_pair(Operands.front(), Operands.back()))
+              .second &&
+          vectorizeStoreChain(Operands, R, Operands.size(), MinVF)) {
+        // Mark the vectorized stores so that we don't vectorize them again.
+        VectorizedStores.insert(Operands.begin(), Operands.end());
+        Changed = true;
+        StartIdx += Operands.size();
+      }
+    }
+  }
+
   // FIXME: Is division-by-2 the correct step? Should we assert that the
   // register size is a power-of-2?
-  unsigned StartIdx = 0;
   for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
     for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
       ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
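Taken together with the vectorizeStoreChain change above, a store chain is only considered for the non-power-of-2 attempt when Operands.size() + 1 is a power of 2 that fits into MaxVF: a 7-store chain (CandVF = 8) qualifies, while a 6-store chain (CandVF = 7) does not. A hypothetical outcome for the qualifying case, assuming the target's MaxVF and cost model allow it (hand-written, analogous to the v15 test below):

define void @store7_sketch(ptr %src, ptr noalias %dst) {
entry:
  ; With -slp-vectorize-non-power-of-2, seven consecutive scalar copies may
  ; collapse into a single <7 x i8> load/store pair; without the flag the
  ; chain is split into power-of-2 pieces (4 + 2 + 1).
  %v = load <7 x i8>, ptr %src, align 1
  store <7 x i8> %v, ptr %dst, align 1
  ret void
}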

llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll

Lines changed: 39 additions & 31 deletions

@@ -1,35 +1,45 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=POW2-ONLY %s

 define void @v15_load_i8_mul_by_constant_store(ptr %src, ptr noalias %dst) {
-; CHECK-LABEL: define void @v15_load_i8_mul_by_constant_store(
-; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
-; CHECK-NEXT:    store <8 x i8> [[TMP1]], ptr [[DST]], align 1
-; CHECK-NEXT:    [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
-; CHECK-NEXT:    [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
-; CHECK-NEXT:    store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
-; CHECK-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
-; CHECK-NEXT:    [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
-; CHECK-NEXT:    [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
-; CHECK-NEXT:    [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
-; CHECK-NEXT:    store i8 [[MUL_12]], ptr [[DST_12]], align 1
-; CHECK-NEXT:    [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
-; CHECK-NEXT:    [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
-; CHECK-NEXT:    [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
-; CHECK-NEXT:    [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
-; CHECK-NEXT:    store i8 [[MUL_13]], ptr [[DST_13]], align 1
-; CHECK-NEXT:    [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
-; CHECK-NEXT:    [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
-; CHECK-NEXT:    [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
-; CHECK-NEXT:    [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
-; CHECK-NEXT:    store i8 [[MUL_14]], ptr [[DST_14]], align 1
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: define void @v15_load_i8_mul_by_constant_store(
+; NON-POW2-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <15 x i8>, ptr [[GEP_SRC_0]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = mul nsw <15 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
+; NON-POW2-NEXT:    store <15 x i8> [[TMP1]], ptr [[DST]], align 1
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: define void @v15_load_i8_mul_by_constant_store(
+; POW2-ONLY-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
+; POW2-ONLY-NEXT:    store <8 x i8> [[TMP1]], ptr [[DST]], align 1
+; POW2-ONLY-NEXT:    [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
+; POW2-ONLY-NEXT:    [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
+; POW2-ONLY-NEXT:    store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
+; POW2-ONLY-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
+; POW2-ONLY-NEXT:    [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
+; POW2-ONLY-NEXT:    [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
+; POW2-ONLY-NEXT:    [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
+; POW2-ONLY-NEXT:    store i8 [[MUL_12]], ptr [[DST_12]], align 1
+; POW2-ONLY-NEXT:    [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
+; POW2-ONLY-NEXT:    [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
+; POW2-ONLY-NEXT:    [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
+; POW2-ONLY-NEXT:    [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
+; POW2-ONLY-NEXT:    store i8 [[MUL_13]], ptr [[DST_13]], align 1
+; POW2-ONLY-NEXT:    [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
+; POW2-ONLY-NEXT:    [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
+; POW2-ONLY-NEXT:    [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
+; POW2-ONLY-NEXT:    [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
+; POW2-ONLY-NEXT:    store i8 [[MUL_14]], ptr [[DST_14]], align 1
+; POW2-ONLY-NEXT:    ret void
 ;
 entry:
   %gep.src.0 = getelementptr inbounds i8, ptr %src, i8 0

@@ -123,5 +133,3 @@ entry:

   ret void
 }
-
-