llvm · HanKuanChen · Dec 13, 2024 · Oct 3, 2024 · Oct 21, 2024 · Oct 21, 2024
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -832,8 +832,103 @@ struct InstructionsState {
       : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
 };
 
+/// One opcode/operand-list form that an instruction can be rewritten into.
+struct InterchangeableInstruction {
+  unsigned Opcode;          ///< Opcode of this equivalent form.
+  SmallVector<Value *> Ops; ///< Operands passed for that form.
+  template <class... T> InterchangeableInstruction(unsigned Opcode, T &&...Args)
+      : Opcode(Opcode), Ops{std::forward<T>(Args)...} {}
+};
+
+bool operator<(const InterchangeableInstruction &LHS,
+               const InterchangeableInstruction &RHS) {
+  return RHS.Opcode > LHS.Opcode; // Opcode-only order; Ops is not compared.
+}
+
 } // end anonymous namespace
 
+/// \returns the instructions that \p I can be rewritten as, sorted by opcode.
+/// The result always contains \p I itself with its original operands. Extra
+/// forms are added only for shl/mul/sub/add whose RHS is a constant integer
+/// (scalar, or a ConstantDataVector splat), e.g.,
+/// x << y -> x * (2^y)  (only when y is smaller than the bit width)
+/// x << 0 -> x *   1   -> x - 0 -> x + 0 -> x & 11...1 -> x | 0
+///           x *   0                     -> x & 0
+///           x *  -1   -> 0 - x
+/// TODO: support more patterns
+static SmallVector<InterchangeableInstruction>
+getInterchangeableInstruction(Instruction *I) {
+  // PII = Possible Interchangeable Instruction
+  SmallVector<InterchangeableInstruction> PII;
+  unsigned Opcode = I->getOpcode();
+  PII.emplace_back(Opcode, I->operands());
+  if (!is_contained({Instruction::Shl, Instruction::Mul, Instruction::Sub,
+                     Instruction::Add},
+                    Opcode))
+    return PII;
+  Constant *C;
+  if (match(I, m_BinOp(m_Value(), m_Constant(C)))) {
+    // Accept a scalar constant or a data vector that splats one integer.
+    auto *V = dyn_cast<ConstantInt>(C);
+    if (auto *CDV = dyn_cast<ConstantDataVector>(C))
+      V = dyn_cast_if_present<ConstantInt>(CDV->getSplatValue());
+    if (!V)
+      return PII;
+    Value *Op0 = I->getOperand(0);
+    Type *Op1Ty = I->getOperand(1)->getType();
+    const APInt &Op1Int = V->getValue();
+    const unsigned BitWidth = Op1Int.getBitWidth();
+    Constant *Zero = ConstantInt::get(Op1Ty, APInt::getZero(BitWidth));
+    Constant *AllOnes = ConstantInt::get(Op1Ty, APInt::getMaxValue(BitWidth));
+    switch (Opcode) {
+    case Instruction::Shl: {
+      // 2^y is built as an APInt: a host `1 << y` overflows int once y >= 31
+      // and yields a wrong multiplier. Out-of-range shifts get no mul form.
+      if (Op1Int.ult(BitWidth)) {
+        APInt Pow2 = APInt::getOneBitSet(BitWidth, Op1Int.getZExtValue());
+        PII.emplace_back(Instruction::Mul, Op0, ConstantInt::get(Op1Ty, Pow2));
+      }
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, AllOnes);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+    case Instruction::Mul: {
+      if (Op1Int.isOne()) {
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, AllOnes);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      } else if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::And, Op0, Zero);
+      } else if (Op1Int.isAllOnes()) {
+        PII.emplace_back(Instruction::Sub, Zero, Op0);
+      }
+      break;
+    }
+    case Instruction::Sub:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, AllOnes);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    case Instruction::Add:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::And, Op0, AllOnes);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+  }
+  // Callers use std::set_intersection, which requires a sorted range.
+  sort(PII);
+  return PII;
+}
+
 /// \returns true if \p Opcode is allowed as part of the main/alternate
 /// instruction for SLP vectorization.
 ///
@@ -938,18 +1033,52 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
       return InstructionsState(VL[BaseIndex], nullptr, nullptr);
   }
+  // Currently, this is only used for binary ops.
+  // TODO: support all instructions
+  SmallVector<InterchangeableInstruction> InterchangeableOpcode =
+      getInterchangeableInstruction(cast<Instruction>(VL[BaseIndex]));
+  SmallVector<InterchangeableInstruction> AlternateInterchangeableOpcode;
+  auto UpdateInterchangeableOpcode =
+      [](SmallVector<InterchangeableInstruction> &LHS,
+         ArrayRef<InterchangeableInstruction> RHS) {
+        SmallVector<InterchangeableInstruction> Shared; // In LHS and RHS.
+        std::set_intersection(LHS.begin(), LHS.end(), RHS.begin(), RHS.end(),
+                              std::back_inserter(Shared));
+        if (Shared.empty())
+          return false; // Keep LHS intact: the caller still reads it.
+        LHS.swap(Shared);
+        return true; // LHS narrowed to the opcodes both lists support.
+      };
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
     auto *I = cast<Instruction>(VL[Cnt]);
     unsigned InstOpcode = I->getOpcode();
     if (IsBinOp && isa<BinaryOperator>(I)) {
-      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+      SmallVector<InterchangeableInstruction> ThisInterchangeableOpcode(
+          getInterchangeableInstruction(I));
+      if (UpdateInterchangeableOpcode(InterchangeableOpcode,
+                                      ThisInterchangeableOpcode))
         continue;
-      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
-          isValidForAlternation(Opcode)) {
-        AltOpcode = InstOpcode;
-        AltIndex = Cnt;
+      if (AlternateInterchangeableOpcode.empty()) {
+        InterchangeableOpcode.erase(
+            remove_if(InterchangeableOpcode,
+                      [](const InterchangeableInstruction &I) {
+                        return !isValidForAlternation(I.Opcode);
+                      }),
+            InterchangeableOpcode.end());
+        ThisInterchangeableOpcode.erase(
+            remove_if(ThisInterchangeableOpcode,
+                      [](const InterchangeableInstruction &I) {
+                        return !isValidForAlternation(I.Opcode);
+                      }),
+            ThisInterchangeableOpcode.end());
+        if (InterchangeableOpcode.empty() || ThisInterchangeableOpcode.empty())
+          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+        AlternateInterchangeableOpcode.swap(ThisInterchangeableOpcode);
         continue;
       }
+      if (UpdateInterchangeableOpcode(AlternateInterchangeableOpcode,
+                                      ThisInterchangeableOpcode))
+        continue;
     } else if (IsCastOp && isa<CastInst>(I)) {
       Value *Op0 = IBase->getOperand(0);
       Type *Ty0 = Op0->getType();
@@ -1043,6 +1172,21 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
   }
 
+  if (IsBinOp) {
+    auto FindOp = [&](ArrayRef<InterchangeableInstruction> CandidateOp) {
+      for (Value *V : VL)
+        for (const InterchangeableInstruction &I : CandidateOp)
+          if (cast<Instruction>(V)->getOpcode() == I.Opcode)
+            return cast<Instruction>(V);
+      llvm_unreachable(
+          "Cannot find the candidate instruction for InstructionsState.");
+    };
+    Instruction *MainOp = FindOp(InterchangeableOpcode);
+    Instruction *AltOp = AlternateInterchangeableOpcode.empty()
+                             ? MainOp
+                             : FindOp(AlternateInterchangeableOpcode);
+    return InstructionsState(VL[BaseIndex], MainOp, AltOp);
+  }
   return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                            cast<Instruction>(VL[AltIndex]));
 }
@@ -2335,24 +2479,41 @@ class BoUpSLP {
                                  : cast<Instruction>(VL[0])->getNumOperands();
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
-      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+      InstructionsState S = getSameOpcode(VL, TLI);
+      for (unsigned OpIdx : seq<unsigned>(NumOperands))
         OpsVec[OpIdx].resize(NumLanes);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
-          // Our tree has just 3 nodes: the root and two operands.
-          // It is therefore trivial to get the APO. We only need to check the
-          // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
-          // RHS operand. The LHS operand of both add and sub is never attached
-          // to an inversese operation in the linearized form, therefore its APO
-          // is false. The RHS is true only if VL[Lane] is an inverse operation.
-
-          // Since operand reordering is performed on groups of commutative
-          // operations or alternating sequences (e.g., +, -), we can safely
-          // tell the inverse operations by checking commutativity.
-          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
+      for (auto [I, V] : enumerate(VL)) {
+        assert(isa<Instruction>(V) && "Expected instruction");
+        SmallVector<InterchangeableInstruction> IIList =
+            getInterchangeableInstruction(cast<Instruction>(V));
+        Value *SelectedOp;
+        auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+          return II.Opcode == S.MainOp->getOpcode();
+        });
+        if (Iter == IIList.end()) {
+          Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+            return II.Opcode == S.AltOp->getOpcode();
+          });
+          SelectedOp = S.AltOp;
+        } else {
+          SelectedOp = S.MainOp;
+        }
+        assert(Iter != IIList.end() &&
+               "Cannot find an interchangeable instruction.");
+        // Our tree has just 3 nodes: the root and two operands.
+        // It is therefore trivial to get the APO. We only need to check the
+        // opcode of V and whether the operand at OpIdx is the LHS or RHS
+        // operand. The LHS operand of both add and sub is never attached to an
+        // inverse operation in the linearized form, therefore its APO is
+        // false. The RHS is true only if V is an inverse operation.
+
+        // Since operand reordering is performed on groups of commutative
+        // operations or alternating sequences (e.g., +, -), we can safely
+        // tell the inverse operations by checking commutativity.
+        bool IsInverseOperation = !isCommutative(cast<Instruction>(SelectedOp));
+        for (unsigned OpIdx : seq<unsigned>(NumOperands)) {
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
-                                 APO, false};
+          OpsVec[OpIdx][I] = {Iter->Ops[OpIdx], APO, false};
         }
       }
     }
@@ -3252,15 +3413,25 @@ class BoUpSLP {
       auto *I0 = cast<Instruction>(Scalars[0]);
       Operands.resize(I0->getNumOperands());
       unsigned NumLanes = Scalars.size();
-      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
-           OpIdx != NumOperands; ++OpIdx) {
+      unsigned NumOperands = I0->getNumOperands();
+      for (unsigned OpIdx : seq<unsigned>(NumOperands))
         Operands[OpIdx].resize(NumLanes);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          auto *I = cast<Instruction>(Scalars[Lane]);
-          assert(I->getNumOperands() == NumOperands &&
-                 "Expected same number of operands");
-          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
-        }
+      for (auto [I, V] : enumerate(Scalars)) {
+        SmallVector<InterchangeableInstruction> IIList =
+            getInterchangeableInstruction(cast<Instruction>(V));
+        auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+          return II.Opcode == MainOp->getOpcode();
+        });
+        if (Iter == IIList.end())
+          Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+            return II.Opcode == AltOp->getOpcode();
+          });
+        assert(Iter != IIList.end() &&
+               "Cannot find an interchangeable instruction.");
+        assert(Iter->Ops.size() == NumOperands &&
+               "Expected same number of operands");
+        for (auto [J, Op] : enumerate(Iter->Ops))
+          Operands[J][I] = Op;
       }
     }
 
@@ -14935,7 +15106,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *V = Builder.CreateBinOp(
           static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
           RHS);
-      propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
+      propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
       if (auto *I = dyn_cast<Instruction>(V)) {
         V = propagateMetadata(I, E->Scalars);
         // Drop nuw flags for abs(sub(commutative), true).

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
@@ -314,10 +314,10 @@ define void @store_try_reorder(ptr %dst) {
 ;
 ; POW2-ONLY-LABEL: @store_try_reorder(
 ; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -7,13 +7,12 @@ define void @test(ptr %a, i64 %0) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    br label %[[BB:.*]]
 ; CHECK:       [[BB]]:
-; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 0>
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
-; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -324,10 +324,10 @@ define void @store_try_reorder(ptr %dst) {
 ;
 ; POW2-ONLY-LABEL: @store_try_reorder(
 ; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -10,9 +10,7 @@ define i32 @foo(ptr nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @bar()
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], <i32 9, i32 9, i32 9, i32 9>
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    ret i32 undef

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -4,22 +4,17 @@
 define void @test(ptr %0, ptr %1, ptr %2) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 4
-; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP15:%.*]] = sub <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 2, i32 0, i32 1, i32 7>
-; CHECK-NEXT:    [[TMP17:%.*]] = add <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP20:%.*]] = add <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = sub <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    store <4 x i32> [[TMP22]], ptr [[TMP2:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> <i32 0, i32 0, i32 1, i32 0>, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP2:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %4 = load i32, ptr %1, align 4