llvm · PeddleSpam · Feb 26, 2025 · Feb 26, 2025 · Feb 28, 2025 · Mar 4, 2025
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
 // CHECK-LABEL: define dso_local spir_kernel void @foo(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
 void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -32,8 +33,10 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include <numeric>
+#include <optional>
 #include <queue>
 #include <set>
+#include <tuple>
 
 #define DEBUG_TYPE "vector-combine"
 #include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -131,6 +134,7 @@ class VectorCombine {
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
   bool shrinkType(Instruction &I);
+  bool shrinkLoadForShuffles(Instruction &I);
 
   void replaceValue(Value &Old, Value &New) {
     LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3483,6 +3487,131 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
   return true;
 }
 
+// Attempt to shrink loads that are only used by shufflevector instructions.
+bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
+  auto *OldLoad = dyn_cast<LoadInst>(&I);
+  if (!OldLoad || !OldLoad->isSimple())
+    return false;
+
+  auto *VecTy = dyn_cast<FixedVectorType>(OldLoad->getType());
+  if (!VecTy)
+    return false;
+
+  // Search all uses of load. If all uses are shufflevector instructions, and
+  // the second operands are all poison values, find the minimum and maximum
+  // indices of the vector elements referenced by all shuffle masks.
+  // Otherwise return `std::nullopt`.
+  using IndexRange = std::pair<int, int>;
+  auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
+    IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1);
+    for (auto &Use : I.uses()) {
+      // All uses must be shufflevector instructions.
+      auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
+      if (!Shuffle)
+        return std::nullopt;
+
+      // Ignore shufflevector instructions that have no uses.
+      if (!Shuffle->hasNUsesOrMore(1u))
+        continue;
+
+      // Ensure second operand is a poison value.
+      auto *Op0 = Shuffle->getOperand(0);
+      auto *Op1 = Shuffle->getOperand(1);
+      if (!isa<PoisonValue>(Op1) && !isa<UndefValue>(Op1))
+        return std::nullopt;
+
+      // Find the min and max indices used by the shufflevector instruction.
+      ArrayRef<int> Mask = Shuffle->getShuffleMask();
+      auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
+      auto NumElems = int(Op0Ty->getNumElements());
+
+      for (int Index : Mask) {
+        if (Index >= 0) {
+          Index %= NumElems;
+          OutputRange.first = std::min(Index, OutputRange.first);
+          OutputRange.second = std::max(Index, OutputRange.second);
+        }
+      }
+    }
+
+    if (OutputRange.second < OutputRange.first)
+      return std::nullopt;
+
+    return OutputRange;
+  };
+
+  // Get the range of vector elements used by shufflevector instructions.
+  if (auto Indices = GetIndexRangeInShuffles()) {
+    unsigned OldSize = VecTy->getNumElements();
+    unsigned NewSize = Indices->second + 1u;
+
+    // If the range of vector elements is smaller than the full load, attempt
+    // to create a smaller load.
+    if (NewSize < OldSize) {
+      auto Builder = IRBuilder(&I);
+      Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+      // Create new load of smaller vector.
+      auto *ElemTy = VecTy->getElementType();
+      auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
+      auto *PtrOp = OldLoad->getPointerOperand();
+      auto *NewLoad = cast<LoadInst>(
+          Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign()));
+      NewLoad->copyMetadata(I);
+
+      // Calculate costs of old and new ops.
+      auto OldCost = TTI.getMemoryOpCost(
+          Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
+          OldLoad->getPointerAddressSpace(), CostKind);
+      auto NewCost = TTI.getMemoryOpCost(
+          Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
+          NewLoad->getPointerAddressSpace(), CostKind);
+
+      using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
+      auto NewUses = SmallVector<UseEntry, 4u>();
+      auto SizeDiff = OldSize - NewSize;
+
+      for (auto &Use : I.uses()) {
+        auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+        auto OldMask = Shuffle->getShuffleMask();
+
+        // Create entry for new use.
+        NewUses.push_back({Shuffle, {}});
+        auto &NewMask = NewUses.back().second;
+        for (auto Index : OldMask)
+          NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index);
+
+        // Update costs.
+        OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, OldMask,
+                                      CostKind);
+        NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy,
+                                      NewMask, CostKind);
+      }
+
+      if (OldCost < NewCost || !NewCost.isValid()) {
+        NewLoad->eraseFromParent();
+        return false;
+      }
+
+      // Replace all uses.
+      for (auto &Use : NewUses) {
+        auto *Shuffle = Use.first;
+        auto &NewMask = Use.second;
+
+        Builder.SetInsertPoint(Shuffle);
+        Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
+        auto *NewShuffle = Builder.CreateShuffleVector(
+            NewLoad, PoisonValue::get(NewVecTy), NewMask);
+
+        replaceValue(*Shuffle, *NewShuffle);
+      }
+
+      return true;
+    }
+  }
+  return false;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -3558,6 +3687,9 @@ bool VectorCombine::run() {
         MadeChange |= foldSelectShuffle(I);
         MadeChange |= foldShuffleToIdentity(I);
         break;
+      case Instruction::Load:
+        MadeChange |= shrinkLoadForShuffles(I);
+        break;
       case Instruction::BitCast:
         MadeChange |= foldBitcastShuffle(I);
         break;

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
 
 define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
 ; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; SSE-NEXT:    ret <4 x float> [[TMP3]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; AVX-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = alloca ptr, align 8

diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,8 +252,7 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
 define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -341,8 +340,7 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
 define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0

diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
 
 define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
 ; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %l = load <2 x i32>, ptr %p, align 1

diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
 ; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
 
 define  <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1)  {
-; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; SSE-NEXT:    [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; SSE-NEXT:    [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; SSE-NEXT:    [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
-; SSE-NEXT:    ret <4 x double> [[BLEND]]
-;
-; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; AVX-NEXT:    [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; AVX-NEXT:    [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; AVX-NEXT:    [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT:    [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT:    [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; AVX-NEXT:    ret <4 x double> [[BLEND]]
+; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
+; CHECK-NEXT:    [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x double> [[BLEND]]
 ;
   %ld0 = load <4 x double>, ptr %p0, align 32
   %ld1 = load <4 x double>, ptr %p1, align 32
@@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
   %s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
   ret <2 x float> %s2
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
+; SSE: {{.*}}