@@ -179,6 +179,10 @@ static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
 
+static cl::opt<bool> VectorizeNonPowerOf2(
+    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
+    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
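As a side note (not part of the patch), the new cl::opt above is the kind of flag that can be passed straight to opt, e.g. `opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2`. A minimal C++ sketch of the sort of store group it targets; the function and variable names below are hypothetical:

// Illustrative only: three consecutive stores form a 3-element chain
// (VF = 3, VF + 1 = 4 is a power of two), so with the new option the chain
// can be widened into a 4-lane vector with one padding (poison) lane.
void store3(int *dst, int a, int b, int c) {
  dst[0] = a; // lane 0
  dst[1] = b; // lane 1
  dst[2] = c; // lane 2 -- lane 3 would be padding
}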
@@ -2733,6 +2737,9 @@ class BoUpSLP {
                   SmallVectorImpl<Value *> *OpScalars = nullptr,
                   SmallVectorImpl<Value *> *AltScalars = nullptr) const;
 
+    /// Returns true if this node has padding lanes (containing poison), i.e.
+    /// a non-power-of-2 number of scalars.
+    bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {
@@ -2891,9 +2898,13 @@ class BoUpSLP {
       ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-
+      if (!isPowerOf2_32(Last->Scalars.size())) {
+        assert(Last->ReorderIndices.empty() &&
+               "Reordering isn't implemented for nodes with padding yet");
+      }
+    }
     return Last;
   }
 
@@ -2921,7 +2932,8 @@ class BoUpSLP {
   /// and fills required data before actual scheduling of the instructions.
   TreeEntry::EntryState getScalarsVectorizationState(
       InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
-      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
+      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+      bool HasPadding) const;
 
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
@@ -3881,6 +3893,9 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   Order.clear();
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
+  if (!Order.empty() && !isPowerOf2_32(VL.size()))
+    return LoadsState::Gather;
+
   if (IsSorted || all_of(PointerOps, [&](Value *P) {
         return arePointersCompatible(P, PointerOps.front(), TLI);
       })) {
@@ -4568,6 +4583,10 @@ bool BoUpSLP::canReorderOperands(
     TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
+  // Reordering isn't implemented for nodes with padding yet.
+  if (UserTE->isNonPowOf2Vec())
+    return false;
+
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
     if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
           return OpData.first == I &&
@@ -4746,6 +4765,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
       const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                           const TreeEntry *TE) {
+        // Reordering for nodes with padding is not implemented yet.
+        if (TE->isNonPowOf2Vec())
+          return false;
         if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
             (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
             (IgnoreReorder && TE->Idx == 0))
@@ -5233,7 +5255,8 @@ static bool isAlternateInstruction(const Instruction *I,
 
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
-    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
+    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+    bool HasPadding) const {
   assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
 
   unsigned ShuffleOrOp =
@@ -5256,7 +5279,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
-    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+    bool Reuse = !HasPadding && canReuseExtract(VL, VL0, CurrentOrder);
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
     LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -5583,6 +5606,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (PWSz == VL.size()) {
         ReuseShuffleIndicies.clear();
       } else {
+        if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+          LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
+                               "for nodes with padding.\n");
+          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+          return false;
+        }
         NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
         NonUniqueValueVL.append(PWSz - UniqueValues.size(),
                                 UniqueValues.back());
@@ -5594,6 +5623,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
         return false;
       }
+      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported for "
+                             "nodes with padding.\n");
+        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+        return false;
+      }
       VL = UniqueValues;
     }
     return true;
@@ -5859,7 +5894,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   OrdersType CurrentOrder;
   SmallVector<Value *> PointerOps;
   TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps,
+      UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec());
   if (State == TreeEntry::NeedToGather) {
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndicies);
@@ -6955,7 +6991,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     return Constant::getAllOnesValue(Ty);
   }
 
-  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root,
+                                     bool WithPadding = false) {
     if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
       return TTI::TCC_Free;
     auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
@@ -6966,7 +7003,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     InstructionsState S = getSameOpcode(VL, *R.TLI);
     const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
     unsigned MinVF = R.getMinVF(2 * Sz);
-    if (VL.size() > 2 &&
+    if (!WithPadding && VL.size() > 2 &&
         ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
          (InVectors.empty() &&
           any_of(seq<unsigned>(0, VL.size() / MinVF),
@@ -7077,7 +7114,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         }
         GatherCost -= ScalarsCost;
       }
-    } else if (!Root && isSplat(VL)) {
+    } else if (!WithPadding && !Root && isSplat(VL)) {
       // Found the broadcasting of the single scalar, calculate the cost as
       // the broadcast.
       const auto *It =
@@ -7640,8 +7677,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         CommonMask[Idx] = Mask[Idx] + VF;
   }
   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
-                Value *Root = nullptr) {
-    Cost += getBuildVectorCost(VL, Root);
+                Value *Root = nullptr, bool WithPadding = false) {
+    Cost += getBuildVectorCost(VL, Root, WithPadding);
     if (!Root) {
       // FIXME: Need to find a way to avoid use of getNullValue here.
       SmallVector<Constant *> Vals;
@@ -9710,6 +9747,9 @@ BoUpSLP::isGatherShuffledEntry(
   // No need to check for the topmost gather node.
   if (TE == VectorizableTree.front().get())
     return {};
+  // Gathering for nodes with padding is not implemented yet.
+  if (TE->isNonPowOf2Vec())
+    return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
@@ -10422,7 +10462,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     add(V1, NewMask);
   }
   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
-                Value *Root = nullptr) {
+                Value *Root = nullptr, bool WithPadding = false) {
     return R.gather(VL, Root);
   }
   Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
@@ -10491,7 +10531,6 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
     SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
     reorderScalars(VL, Mask);
   }
-  const unsigned VF = VL.size();
   InstructionsState S = getSameOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
   if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
@@ -10533,6 +10572,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
       ShuffleBuilder.add(V, Mask);
       return ShuffleBuilder.finalize(std::nullopt);
     };
+    const unsigned VF = VL.size();
     Value *V = vectorizeTree(VE, PostponedPHIs);
     if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
       if (!VE->ReuseShuffleIndices.empty()) {
@@ -10659,6 +10699,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     return true;
   };
   BVTy ShuffleBuilder(Params...);
+  if (E->isNonPowOf2Vec()) {
+    Value *BV = ShuffleBuilder.gather(E->Scalars, 0, nullptr, true);
+    SmallVector<int> Mask(VF, PoisonMaskElem);
+    std::iota(Mask.begin(), Mask.begin() + E->Scalars.size(), 0);
+    ShuffleBuilder.add(BV, Mask);
+    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+  }
+
   ResTy Res = ResTy();
   SmallVector<int> Mask;
   SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
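For illustration only, here is a small standalone sketch (assuming VF = 4, three real scalars, and -1 standing in for LLVM's PoisonMaskElem) of the shuffle mask the padding path above builds with std::iota:

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Mirror of the mask construction above: VF lanes, all poison (-1),
  // then identity indices for the lanes that carry real scalars.
  const unsigned VF = 4, NumScalars = 3;
  std::vector<int> Mask(VF, -1); // -1 plays the role of PoisonMaskElem
  std::iota(Mask.begin(), Mask.begin() + NumScalars, 0);
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 1 2 -1
  std::printf("\n");
  return 0;
}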
@@ -13422,8 +13470,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
   unsigned VF = Chain.size();
 
-  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
-    return false;
+  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
+    // Check if vectorizing with a non-power-of-2 VF should be considered. At
+    // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
+    // all vector lanes are used.
+    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
+      return false;
+  }
 
   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                     << "\n");
@@ -13519,9 +13572,39 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                       << "MinVF (" << MinVF << ")\n");
     }
 
+    unsigned StartIdx = 0;
+    if (VectorizeNonPowerOf2) {
+      // Try vectorizing with a non-power-of-2 VF. At the moment, only
+      // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
+      // lanes are used.
+      unsigned CandVF = Operands.size() + 1;
+      if (isPowerOf2_32(CandVF) && CandVF <= MaxVF) {
+        assert(
+            all_of(
+                Operands,
+                [&](Value *V) {
+                  return cast<StoreInst>(V)->getValueOperand()->getType() ==
+                         cast<StoreInst>(Operands.front())
+                             ->getValueOperand()
+                             ->getType();
+                }) &&
+            "Expected all operands of same type.");
+        if (!VectorizedStores.count(Operands.front()) &&
+            !VectorizedStores.count(Operands.back()) &&
+            TriedSequences
+                .insert(std::make_pair(Operands.front(), Operands.back()))
+                .second &&
+            vectorizeStoreChain(Operands, R, Operands.size(), MinVF)) {
+          // Mark the vectorized stores so that we don't vectorize them again.
+          VectorizedStores.insert(Operands.begin(), Operands.end());
+          Changed = true;
+          StartIdx += Operands.size();
+        }
+      }
+    }
+
     // FIXME: Is division-by-2 the correct step? Should we assert that the
     // register size is a power-of-2?
-    unsigned StartIdx = 0;
     for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
       for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
         ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
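A brief sketch of the candidate-VF gate used above (the helper below is hypothetical, not from the patch): a chain of N stores is only tried as a non-power-of-2 group when N + 1 is a power of two and does not exceed MaxVF, so for example chains of 3, 7, or 15 stores qualify while chains of 5 or 6 do not.

#include <cstdio>

// Hypothetical stand-in for llvm::isPowerOf2_32.
static bool isPowerOf2(unsigned X) { return X != 0 && (X & (X - 1)) == 0; }

// Mirrors the gate in vectorizeStores: try a non-power-of-2 chain of
// NumStores stores only if NumStores + 1 is a power of two and fits MaxVF.
static bool tryNonPowerOf2Chain(unsigned NumStores, unsigned MaxVF) {
  unsigned CandVF = NumStores + 1;
  return isPowerOf2(CandVF) && CandVF <= MaxVF;
}

int main() {
  for (unsigned N = 2; N <= 16; ++N)
    std::printf("%u stores -> %s\n", N,
                tryNonPowerOf2Chain(N, 16) ? "try" : "skip");
  return 0;
}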