Skip to content

Commit b8b5105

Browse files
committed
AMDGPU: Use freeze poison instead of undef in alloca promotion
Previously, the value created to represent the uninitialized memory of the alloca was undef; use freeze poison instead. This enables some optimization improvements (which need defeating in the limit tests), but also causes a few regressions. It also seems to leave behind dead code in some cases.
1 parent f406b28 commit b8b5105

15 files changed

+279
-74
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,15 @@ static void forEachWorkListItem(const InstContainer &WorkList,
773773
}
774774
}
775775

776+
/// Advance \p I past the consecutive run of allocas it points into and return
777+
/// the first non-alloca position in \p BB (BB.end() if none follows).
778+
static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
779+
BasicBlock::iterator I) {
780+
for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
781+
; // Intentionally empty: the loop header performs the whole scan.
782+
return I;
783+
}
784+
776785
// FIXME: Should try to pick the most likely to be profitable allocas first.
777786
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
778787
LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -989,7 +998,16 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
989998
// undef.
990999
SSAUpdater Updater;
9911000
Updater.Initialize(VectorTy, "promotealloca");
992-
Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));
1001+
1002+
BasicBlock *EntryBB = Alloca.getParent();
1003+
BasicBlock::iterator InitInsertPos =
1004+
skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
1005+
// Alloca memory is undefined to begin, not poison.
1006+
Value *AllocaInitValue =
1007+
new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
1008+
AllocaInitValue->takeName(&Alloca);
1009+
1010+
Updater.AddAvailableValue(EntryBB, AllocaInitValue);
9931011

9941012
// First handle the initial worklist.
9951013
SmallVector<LoadInst *, 4> DeferredLoads;

llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,12 @@ define amdgpu_vs void @promote_store_aggr() #0 {
8787

8888
define amdgpu_vs void @promote_load_from_store_aggr() #0 {
8989
; CHECK-LABEL: @promote_load_from_store_aggr(
90+
; CHECK-NEXT: [[F1:%.*]] = freeze <2 x float> poison
9091
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
9192
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
9293
; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
9394
; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
94-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0
95+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[F1]], float [[FOO3_FCA_0_EXTRACT]], i32 0
9596
; CHECK-NEXT: [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
9697
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i32 1
9798
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
@@ -131,8 +132,9 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
131132
; optimized out (variable %aliasTofoo3 in the test)
132133
define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
133134
; CHECK-LABEL: @promote_load_from_store_aggr_varoff(
135+
; CHECK-NEXT: [[F1:%.*]] = freeze <3 x i32> poison
134136
; CHECK-NEXT: [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
135-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[FOO3_UNPACK2]], i32 2
137+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> [[F1]], i32 [[FOO3_UNPACK2]], i32 2
136138
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
137139
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> [[INPUT:%.*]], i32 [[TMP2]], i64 3
138140
; CHECK-NEXT: store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
@@ -152,6 +154,15 @@ define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
152154

153155
define amdgpu_vs void @promote_memmove_aggr() #0 {
154156
; CHECK-LABEL: @promote_memmove_aggr(
157+
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
158+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
159+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 0.000000e+00, i32 1
160+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <5 x float> [[TMP2]], float 0.000000e+00, i32 2
161+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 0.000000e+00, i32 3
162+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
163+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 1.000000e+00, i32 1
164+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 2.000000e+00, i32 3
165+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <5 x float> [[TMP7]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
155166
; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(1) @pv, align 4
156167
; CHECK-NEXT: ret void
157168
;
@@ -169,9 +180,16 @@ define amdgpu_vs void @promote_memmove_aggr() #0 {
169180

170181
define amdgpu_vs void @promote_memcpy_aggr() #0 {
171182
; CHECK-LABEL: @promote_memcpy_aggr(
183+
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
184+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
185+
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 1
186+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 2
187+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP9]], float 0.000000e+00, i32 3
188+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
189+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 2.000000e+00, i32 3
172190
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
173191
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
174-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, float 3.000000e+00, i32 [[FOO4]]
192+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[TMP6]], float 3.000000e+00, i32 [[FOO4]]
175193
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
176194
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
177195
; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4
@@ -300,9 +318,15 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0
300318

301319
define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
302320
; CHECK-LABEL: @promote_memcpy_inline_aggr(
321+
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
322+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
323+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 0.000000e+00, i32 1
324+
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 2
325+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 3
326+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
303327
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
304328
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
305-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]]
329+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[TMP5]], float 3.000000e+00, i32 [[FOO4]]
306330
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
307331
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
308332
; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4

llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ define amdgpu_kernel void @simple_users_scores() {
99
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
1010
; CHECK-NEXT: [[ENTRY:.*:]]
1111
; CHECK-NEXT: [[MANYUSERS:%.*]] = alloca [64 x i64], align 4, addrspace(5)
12+
; CHECK-NEXT: [[SIMPLEUSER:%.*]] = freeze <4 x i64> poison
13+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> [[SIMPLEUSER]], i64 42, i32 0
1214
; CHECK-NEXT: [[MANYUSERS_1:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 2
1315
; CHECK-NEXT: [[V0:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_1]], align 1
1416
; CHECK-NEXT: [[V0_EXT:%.*]] = zext i8 [[V0]] to i64

llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,19 @@ define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) {
77
; CHECK-LABEL: define amdgpu_kernel void @test_overwrite
88
; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
99
; CHECK-NEXT: entry:
10+
; CHECK-NEXT: [[STACK:%.*]] = freeze <3 x i64> poison
11+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0
1012
; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
1113
; CHECK: loop:
12-
; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
13-
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
14-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0
15-
; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[TMP1]], i64 32, i32 0
16-
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 68
14+
; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP3:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
15+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
16+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0
17+
; CHECK-NEXT: [[TMP3]] = insertelement <3 x i64> [[TMP2]], i64 32, i32 0
18+
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 68
1719
; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
1820
; CHECK: end:
19-
; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
20-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
21+
; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP3]], [[LOOP]] ], [ [[TMP0]], [[ENTRY]] ]
22+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
2123
; CHECK-NEXT: ret void
2224
;
2325
entry:
@@ -42,8 +44,9 @@ define <4 x i64> @test_fullvec_out_of_bounds(<4 x i64> %arg) {
4244
; CHECK-LABEL: define <4 x i64> @test_fullvec_out_of_bounds
4345
; CHECK-SAME: (<4 x i64> [[ARG:%.*]]) {
4446
; CHECK-NEXT: entry:
47+
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i64> poison
4548
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i64> [[ARG]], i64 0
46-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i32 3
49+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[STACK]], i64 [[TMP0]], i32 3
4750
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[ARG]], i64 1
4851
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[ARG]], i64 2
4952
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[ARG]], i64 3
@@ -62,17 +65,19 @@ define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) {
6265
; CHECK-LABEL: define amdgpu_kernel void @test_no_overwrite
6366
; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
6467
; CHECK-NEXT: entry:
68+
; CHECK-NEXT: [[STACK:%.*]] = freeze <3 x i64> poison
69+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0
6570
; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
6671
; CHECK: loop:
67-
; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
68-
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
69-
; CHECK-NEXT: [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 32, i32 1
70-
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 32
72+
; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
73+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
74+
; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 32, i32 1
75+
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 32
7176
; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
7277
; CHECK: end:
73-
; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
74-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
75-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 1
78+
; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ [[TMP0]], [[ENTRY]] ]
79+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
80+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 1
7681
; CHECK-NEXT: ret void
7782
;
7883
entry:
@@ -97,6 +102,7 @@ define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
97102
; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
98103
; CHECK-SAME: (ptr [[ARG:%.*]]) {
99104
; CHECK-NEXT: entry:
105+
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <8 x i8> poison
100106
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
101107
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8>
102108
; CHECK-NEXT: ret ptr [[ARG]]
@@ -112,6 +118,7 @@ define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg
112118
; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec
113119
; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
114120
; CHECK-NEXT: entry:
121+
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <4 x i8> poison
115122
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[ARG]] to i32
116123
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to <4 x i8>
117124
; CHECK-NEXT: ret ptr addrspace(3) [[ARG]]
@@ -127,6 +134,7 @@ define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr>
127134
; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
128135
; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
129136
; CHECK-NEXT: entry:
137+
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <4 x i32> poison
130138
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
131139
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
132140
; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
@@ -143,6 +151,7 @@ define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) {
143151
; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full
144152
; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) {
145153
; CHECK-NEXT: entry:
154+
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x ptr addrspace(5)> poison
146155
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[ARG]] to <4 x i32>
147156
; CHECK-NEXT: [[TMP1:%.*]] = inttoptr <4 x i32> [[TMP0]] to <4 x ptr addrspace(5)>
148157
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16>
@@ -159,19 +168,22 @@ define void @alloca_load_store_ptr_mixed_ptrvec(<2 x ptr addrspace(3)> %arg) {
159168
; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_ptrvec
160169
; CHECK-SAME: (<2 x ptr addrspace(3)> [[ARG:%.*]]) {
161170
; CHECK-NEXT: entry:
171+
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <8 x i32> poison
162172
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[ARG]] to <2 x i32>
163173
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0
164-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 0
174+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[ALLOCA]], i32 [[TMP1]], i32 0
165175
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i64 1
166176
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP3]], i32 1
167177
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0
168178
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP3]], i64 1
169179
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x ptr addrspace(3)>
170180
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
171181
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3]], i64 1
172-
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 undef, i64 2
173-
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 undef, i64 3
174-
; CHECK-NEXT: [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x ptr addrspace(3)>
182+
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
183+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i64 2
184+
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
185+
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 3
186+
; CHECK-NEXT: [[TMP14:%.*]] = inttoptr <4 x i32> [[TMP13]] to <4 x ptr addrspace(3)>
175187
; CHECK-NEXT: ret void
176188
;
177189
entry:

0 commit comments

Comments
 (0)