Skip to content

AMDGPU: Use freeze poison instead of undef in alloca promotion #131285

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,15 @@ static void forEachWorkListItem(const InstContainer &WorkList,
}
}

/// Find an insert point after an alloca, after all other allocas clustered at
/// the start of the block.
static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
BasicBlock::iterator I) {
for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
;
return I;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
Expand Down Expand Up @@ -989,7 +998,16 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
// undef.
SSAUpdater Updater;
Updater.Initialize(VectorTy, "promotealloca");
Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));

BasicBlock *EntryBB = Alloca.getParent();
BasicBlock::iterator InitInsertPos =
skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
// Alloca memory is undefined to begin, not poison.
Value *AllocaInitValue =
new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is going to become a common pattern to replace undef, we should probably make a helper for it eventually
Maybe a static helper like FreezeInst::getPoison(Ty, "", pos) ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I doubt this is going to be that common

AllocaInitValue->takeName(&Alloca);

Updater.AddAvailableValue(EntryBB, AllocaInitValue);

// First handle the initial worklist.
SmallVector<LoadInst *, 4> DeferredLoads;
Expand Down
32 changes: 28 additions & 4 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,12 @@ define amdgpu_vs void @promote_store_aggr() #0 {

define amdgpu_vs void @promote_load_from_store_aggr() #0 {
; CHECK-LABEL: @promote_load_from_store_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <2 x float> poison
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[F1]], float [[FOO3_FCA_0_EXTRACT]], i32 0
; CHECK-NEXT: [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
Expand Down Expand Up @@ -131,8 +132,9 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
; optimized out (variable %aliasTofoo3 in the test)
define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
; CHECK-LABEL: @promote_load_from_store_aggr_varoff(
; CHECK-NEXT: [[F1:%.*]] = freeze <3 x i32> poison
; CHECK-NEXT: [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[FOO3_UNPACK2]], i32 2
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> [[F1]], i32 [[FOO3_UNPACK2]], i32 2
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> [[INPUT:%.*]], i32 [[TMP2]], i64 3
; CHECK-NEXT: store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
Expand All @@ -152,6 +154,15 @@ define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {

define amdgpu_vs void @promote_memmove_aggr() #0 {
; CHECK-LABEL: @promote_memmove_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <5 x float> [[TMP2]], float 0.000000e+00, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 0.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 1.000000e+00, i32 1
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 2.000000e+00, i32 3
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <5 x float> [[TMP7]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
Expand All @@ -169,9 +180,16 @@ define amdgpu_vs void @promote_memmove_aggr() #0 {

define amdgpu_vs void @promote_memcpy_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP9]], float 0.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 2.000000e+00, i32 3
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[TMP6]], float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4
Expand Down Expand Up @@ -300,9 +318,15 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0

define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_inline_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[TMP5]], float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ define amdgpu_kernel void @simple_users_scores() {
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[MANYUSERS:%.*]] = alloca [64 x i64], align 4, addrspace(5)
; CHECK-NEXT: [[SIMPLEUSER:%.*]] = freeze <4 x i64> poison
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> [[SIMPLEUSER]], i64 42, i32 0
; CHECK-NEXT: [[MANYUSERS_1:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 2
; CHECK-NEXT: [[V0:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_1]], align 1
; CHECK-NEXT: [[V0_EXT:%.*]] = zext i8 [[V0]] to i64
Expand Down
50 changes: 31 additions & 19 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,19 @@ define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) {
; CHECK-LABEL: define amdgpu_kernel void @test_overwrite
; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STACK:%.*]] = freeze <3 x i64> poison
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0
; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0
; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[TMP1]], i64 32, i32 0
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 68
; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP3:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0
; CHECK-NEXT: [[TMP3]] = insertelement <3 x i64> [[TMP2]], i64 32, i32 0
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 68
; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
; CHECK: end:
; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP3]], [[LOOP]] ], [ [[TMP0]], [[ENTRY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
; CHECK-NEXT: ret void
;
entry:
Expand All @@ -42,8 +44,9 @@ define <4 x i64> @test_fullvec_out_of_bounds(<4 x i64> %arg) {
; CHECK-LABEL: define <4 x i64> @test_fullvec_out_of_bounds
; CHECK-SAME: (<4 x i64> [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i64> poison
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i64> [[ARG]], i64 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i32 3
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[STACK]], i64 [[TMP0]], i32 3
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[ARG]], i64 1
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[ARG]], i64 2
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[ARG]], i64 3
Expand All @@ -62,17 +65,19 @@ define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) {
; CHECK-LABEL: define amdgpu_kernel void @test_no_overwrite
; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STACK:%.*]] = freeze <3 x i64> poison
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0
; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
; CHECK-NEXT: [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 32, i32 1
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 32
; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 32, i32 1
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 32
; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
; CHECK: end:
; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 1
; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ [[TMP0]], [[ENTRY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 1
; CHECK-NEXT: ret void
;
entry:
Expand All @@ -97,6 +102,7 @@ define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
; CHECK-SAME: (ptr [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <8 x i8> poison
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8>
; CHECK-NEXT: ret ptr [[ARG]]
Expand All @@ -112,6 +118,7 @@ define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg
; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec
; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[ARG]] to i32
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to <4 x i8>
; CHECK-NEXT: ret ptr addrspace(3) [[ARG]]
Expand All @@ -127,6 +134,7 @@ define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr>
; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <4 x i32> poison
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
Expand All @@ -143,6 +151,7 @@ define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) {
; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full
; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x ptr addrspace(5)> poison
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[ARG]] to <4 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = inttoptr <4 x i32> [[TMP0]] to <4 x ptr addrspace(5)>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16>
Expand All @@ -159,19 +168,22 @@ define void @alloca_load_store_ptr_mixed_ptrvec(<2 x ptr addrspace(3)> %arg) {
; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_ptrvec
; CHECK-SAME: (<2 x ptr addrspace(3)> [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <8 x i32> poison
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[ARG]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[ALLOCA]], i32 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP3]], i64 1
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x ptr addrspace(3)>
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3]], i64 1
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 undef, i64 2
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 undef, i64 3
; CHECK-NEXT: [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x ptr addrspace(3)>
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i64 2
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 3
; CHECK-NEXT: [[TMP14:%.*]] = inttoptr <4 x i32> [[TMP13]] to <4 x ptr addrspace(3)>
; CHECK-NEXT: ret void
;
entry:
Expand Down
Loading
Loading