llvm · arsenm · Mar 18, 2025 · Mar 14, 2025 · Pierre-vh · Mar 14, 2025
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -773,6 +773,15 @@ static void forEachWorkListItem(const InstContainer &WorkList,
   }
 }
 
+/// Starting at \p I, step over the run of consecutive allocas in \p BB and
+/// return the first non-alloca position (or the block end) as an insert point.
+static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
+                                                    BasicBlock::iterator I) {
+  while (I != BB.end() && isa<AllocaInst>(*I))
+    ++I;
+  return I;
+}
+
 // FIXME: Should try to pick the most likely to be profitable allocas first.
 bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -989,7 +998,16 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   // undef.
   SSAUpdater Updater;
   Updater.Initialize(VectorTy, "promotealloca");
-  Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));
+
+  BasicBlock *EntryBB = Alloca.getParent();
+  BasicBlock::iterator InitInsertPos =
+      skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
+  // Alloca memory is undefined to begin, not poison.
+  Value *AllocaInitValue =
+      new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
+  AllocaInitValue->takeName(&Alloca);
+
+  Updater.AddAvailableValue(EntryBB, AllocaInitValue);
 
   // First handle the initial worklist.
   SmallVector<LoadInst *, 4> DeferredLoads;

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -87,11 +87,12 @@ define amdgpu_vs void @promote_store_aggr() #0 {
 
 define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-LABEL: @promote_load_from_store_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = freeze <2 x float> poison
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
 ; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[F1]], float [[FOO3_FCA_0_EXTRACT]], i32 0
 ; CHECK-NEXT:    [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
@@ -131,8 +132,9 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; optimized out (variable %aliasTofoo3 in the test)
 define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
 ; CHECK-LABEL: @promote_load_from_store_aggr_varoff(
+; CHECK-NEXT:    [[F1:%.*]] = freeze <3 x i32> poison
 ; CHECK-NEXT:    [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[FOO3_UNPACK2]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> [[F1]], i32 [[FOO3_UNPACK2]], i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
 ; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x i32> [[INPUT:%.*]], i32 [[TMP2]], i64 3
 ; CHECK-NEXT:    store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
@@ -152,6 +154,15 @@ define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
 
 define amdgpu_vs void @promote_memmove_aggr() #0 {
 ; CHECK-LABEL: @promote_memmove_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = freeze <5 x float> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 0.000000e+00, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <5 x float> [[TMP2]], float 0.000000e+00, i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 0.000000e+00, i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 1.000000e+00, i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 2.000000e+00, i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <5 x float> [[TMP7]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
 ; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -169,9 +180,16 @@ define amdgpu_vs void @promote_memmove_aggr() #0 {
 
 define amdgpu_vs void @promote_memcpy_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = freeze <5 x float> poison
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP9]], float 0.000000e+00, i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 2.000000e+00, i32 3
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> [[TMP6]], float 3.000000e+00, i32 [[FOO4]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
@@ -300,9 +318,15 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0
 
 define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_inline_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = freeze <5 x float> poison
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 0.000000e+00, i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> [[TMP5]], float 3.000000e+00, i32 [[FOO4]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @simple_users_scores() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[MANYUSERS:%.*]] = alloca [64 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    [[SIMPLEUSER:%.*]] = freeze <4 x i64> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> [[SIMPLEUSER]], i64 42, i32 0
 ; CHECK-NEXT:    [[MANYUSERS_1:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 2
 ; CHECK-NEXT:    [[V0:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_1]], align 1
 ; CHECK-NEXT:    [[V0_EXT:%.*]] = zext i8 [[V0]] to i64

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
@@ -7,17 +7,19 @@ define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @test_overwrite
 ; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <3 x i64> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0
 ; CHECK-NEXT:    br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0
-; CHECK-NEXT:    [[TMP2]] = insertelement <3 x i64> [[TMP1]], i64 32, i32 0
-; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 68
+; CHECK-NEXT:    [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP3:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0
+; CHECK-NEXT:    [[TMP3]] = insertelement <3 x i64> [[TMP2]], i64 32, i32 0
+; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 68
 ; CHECK-NEXT:    br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
+; CHECK-NEXT:    [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP3]], [[LOOP]] ], [ [[TMP0]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -42,8 +44,9 @@ define <4 x i64> @test_fullvec_out_of_bounds(<4 x i64> %arg) {
 ; CHECK-LABEL: define <4 x i64> @test_fullvec_out_of_bounds
 ; CHECK-SAME: (<4 x i64> [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i64> poison
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i64> [[ARG]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> [[STACK]], i64 [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[ARG]], i64 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[ARG]], i64 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[ARG]], i64 3
@@ -62,17 +65,19 @@ define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @test_no_overwrite
 ; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <3 x i64> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0
 ; CHECK-NEXT:    br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
-; CHECK-NEXT:    [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 32, i32 1
-; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 32
+; CHECK-NEXT:    [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT:    [[TMP2]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 32, i32 1
+; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 32
 ; CHECK-NEXT:    br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 1
+; CHECK-NEXT:    [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ [[TMP0]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -97,6 +102,7 @@ define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
 ; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
 ; CHECK-SAME: (ptr [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <8 x i8> poison
 ; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8>
 ; CHECK-NEXT:    ret ptr [[ARG]]
@@ -112,6 +118,7 @@ define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg
 ; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec
 ; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <4 x i8> poison
 ; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[ARG]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[TMP0]] to <4 x i8>
 ; CHECK-NEXT:    ret ptr addrspace(3) [[ARG]]
@@ -127,6 +134,7 @@ define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr>
 ; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
 ; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <4 x i32> poison
 ; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
@@ -143,6 +151,7 @@ define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) {
 ; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full
 ; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x ptr addrspace(5)> poison
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[ARG]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = inttoptr <4 x i32> [[TMP0]] to <4 x ptr addrspace(5)>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16>
@@ -159,19 +168,22 @@ define void @alloca_load_store_ptr_mixed_ptrvec(<2 x ptr addrspace(3)> %arg) {
 ; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_ptrvec
 ; CHECK-SAME: (<2 x ptr addrspace(3)> [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <8 x i32> poison
 ; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[ARG]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[ALLOCA]], i32 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP3]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x ptr addrspace(3)>
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3]], i64 1
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 undef, i64 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 undef, i64 3
-; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i64 2
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 3
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr <4 x i32> [[TMP13]] to <4 x ptr addrspace(3)>
 ; CHECK-NEXT:    ret void
 ;
 entry: