Commit 688c3ff (parent: d47401e)

[X86] masked_gather_scatter.ll - add base gather from struct tests for #130920

Show current codegen for base reference

llvm/test/CodeGen/X86/masked_gather_scatter.ll

Lines changed: 306 additions & 0 deletions
@@ -5245,3 +5245,309 @@ bb:
  ret <8 x i64> %tmp1
}
declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)

; Test gathers from struct
%struct.pt = type { float, float, float, i32 }

define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_gather_structpt_16f32_mask_index:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vmovdqu64 (%rsi), %zmm0
; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; KNL_64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; KNL_64-NEXT: vpsllq $4, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $4, %zmm2, %zmm2
; KNL_64-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_structpt_16f32_mask_index:
; KNL_32: # %bb.0:
; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: vpslld $4, (%ecx), %zmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX_SMALL-LABEL: test_gather_structpt_16f32_mask_index:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
; SKX_SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; SKX_SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; SKX_SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
; SKX_SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; SKX_SMALL-NEXT: retq
;
; SKX_LARGE-LABEL: test_gather_structpt_16f32_mask_index:
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
; SKX_LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; SKX_LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
; SKX_LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
; SKX_LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; SKX_LARGE-NEXT: retq
;
; SKX_32-LABEL: test_gather_structpt_16f32_mask_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_32-NEXT: vpmovd2m %zmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; SKX_32-NEXT: vpslld $4, (%ecx), %zmm0
; SKX_32-NEXT: vgatherdps (%eax,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
  %wide.load = load <16 x i32>, ptr %arr, align 4
  %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
  %zext = zext <16 x i32> %and to <16 x i64>
  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext
  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  ret <16 x float> %res
}
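
; For reference, each lane of the masked gather above is equivalent to the
; following scalar IR (an illustrative sketch, not part of the commit;
; @gather_lane and its parameters are hypothetical): the loaded index is
; clamped to 29 bits, scaled by sizeof(%struct.pt) = 16 bytes through the
; struct GEP, and the first float field is loaded only where the mask is set.
define float @gather_lane(ptr %x, i32 %raw, i1 %m, float %passthru) {
entry:
  %masked = and i32 %raw, 536870911                          ; 0x1FFFFFFF: index fits in 29 bits
  %idx = zext i32 %masked to i64                             ; known non-negative after the and
  %p = getelementptr inbounds %struct.pt, ptr %x, i64 %idx   ; &x[idx], 16-byte stride
  br i1 %m, label %load, label %done
load:
  %v = load float, ptr %p, align 4                           ; first float field of x[idx]
  br label %done
done:
  %res = phi float [ %v, %load ], [ %passthru, %entry ]      ; masked-off lanes keep the passthru
  ret float %res
}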

define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_gather_structpt_16f32_mask_index_offset:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vmovdqu64 (%rsi), %zmm0
; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; KNL_64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; KNL_64-NEXT: vpsllq $4, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $4, %zmm2, %zmm2
; KNL_64-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_structpt_16f32_mask_index_offset:
; KNL_32: # %bb.0:
; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: vpslld $4, (%ecx), %zmm0
; KNL_32-NEXT: vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX_SMALL-LABEL: test_gather_structpt_16f32_mask_index_offset:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
; SKX_SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; SKX_SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; SKX_SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
; SKX_SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; SKX_SMALL-NEXT: retq
;
; SKX_LARGE-LABEL: test_gather_structpt_16f32_mask_index_offset:
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
; SKX_LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; SKX_LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
; SKX_LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
; SKX_LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; SKX_LARGE-NEXT: retq
;
; SKX_32-LABEL: test_gather_structpt_16f32_mask_index_offset:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_32-NEXT: vpmovd2m %zmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; SKX_32-NEXT: vpslld $4, (%ecx), %zmm0
; SKX_32-NEXT: vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
  %wide.load = load <16 x i32>, ptr %arr, align 4
  %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
  %zext = zext <16 x i32> %and to <16 x i64>
  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext, i32 1
  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  ret <16 x float> %res
}
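
; The only difference from the previous test is the trailing "i32 1" on the
; GEP, which selects the second float field of %struct.pt, so each lane's
; address is x + 16*idx + 4; that constant +4 is visible above as the
; 4(%rdi,...) / 4(%eax,...) displacement on the gathers. A minimal per-lane
; sketch of the address math (%idx is a hypothetical in-range index):
;   %p0 = getelementptr inbounds %struct.pt, ptr %x, i64 %idx         ; x + 16*idx
;   %p1 = getelementptr inbounds %struct.pt, ptr %x, i64 %idx, i32 1  ; x + 16*idx + 4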

define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_gather_16f32_mask_index_pair:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vmovdqu64 (%rsi), %zmm0
; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; KNL_64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; KNL_64-NEXT: vpsllq $4, %zmm0, %zmm3
; KNL_64-NEXT: vpsllq $4, %zmm2, %zmm2
; KNL_64-NEXT: vextractf64x4 $1, %zmm1, %ymm4
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: kmovw %k2, %k3
; KNL_64-NEXT: vmovaps %ymm4, %ymm0
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
; KNL_64-NEXT: vmovaps %ymm1, %ymm5
; KNL_64-NEXT: kmovw %k1, %k3
; KNL_64-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16f32_mask_index_pair:
; KNL_32: # %bb.0:
; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: vpslld $4, (%ecx), %zmm2
; KNL_32-NEXT: vpbroadcastd %eax, %zmm0
; KNL_32-NEXT: vpaddd %zmm2, %zmm0, %zmm3
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm2), %zmm0 {%k2}
; KNL_32-NEXT: vgatherdps 4(,%zmm3), %zmm1 {%k1}
; KNL_32-NEXT: retl
;
; SKX_SMALL-LABEL: test_gather_16f32_mask_index_pair:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
; SKX_SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; SKX_SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; SKX_SMALL-NEXT: vpsllq $4, %zmm0, %zmm3
; SKX_SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
; SKX_SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
; SKX_SMALL-NEXT: kmovw %k2, %k3
; SKX_SMALL-NEXT: vmovaps %ymm4, %ymm0
; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
; SKX_SMALL-NEXT: vmovaps %ymm1, %ymm5
; SKX_SMALL-NEXT: kmovw %k1, %k3
; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; SKX_SMALL-NEXT: retq
;
; SKX_LARGE-LABEL: test_gather_16f32_mask_index_pair:
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
; SKX_LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; SKX_LARGE-NEXT: vpsllq $4, %zmm0, %zmm3
; SKX_LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
; SKX_LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm4
; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
; SKX_LARGE-NEXT: vmovaps %ymm4, %ymm0
; SKX_LARGE-NEXT: kmovw %k2, %k3
; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
; SKX_LARGE-NEXT: vmovaps %ymm1, %ymm5
; SKX_LARGE-NEXT: kmovw %k1, %k3
; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; SKX_LARGE-NEXT: retq
;
; SKX_32-LABEL: test_gather_16f32_mask_index_pair:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_32-NEXT: vpmovd2m %zmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; SKX_32-NEXT: vpslld $4, (%ecx), %zmm2
; SKX_32-NEXT: vpbroadcastd %eax, %zmm0
; SKX_32-NEXT: vpaddd %zmm2, %zmm0, %zmm3
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: vgatherdps (%eax,%zmm2), %zmm0 {%k2}
; SKX_32-NEXT: vgatherdps 4(,%zmm3), %zmm1 {%k1}
; SKX_32-NEXT: retl
  %wide.load = load <16 x i32>, ptr %arr, align 4
  %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
  %zext = zext <16 x i32> %and to <16 x i64>
  %ptrs1 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext
  %res1 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs1, i32 4, <16 x i1> %mask, <16 x float> %src0)
  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext, i32 1
  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  %pair1 = insertvalue {<16 x float>, <16 x float>} undef, <16 x float> %res1, 0
  %pair2 = insertvalue {<16 x float>, <16 x float>} %pair1, <16 x float> %res, 1
  ret {<16 x float>, <16 x float>} %pair2
}
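
; The pair test reuses one index computation for two gathers (field 0 and
; field 1) and returns both vectors in a literal struct built with
; insertvalue. Note the kmovw copies in the codegen above: AVX-512 gather
; instructions zero their mask register as elements complete, so the mask
; must be saved before it can be reused for the second gather. A hypothetical
; caller would unpack the returned pair with extractvalue:
;   %pair = call { <16 x float>, <16 x float> } @test_gather_16f32_mask_index_pair(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0)
;   %lo = extractvalue { <16 x float>, <16 x float> } %pair, 0  ; gather of field 0
;   %hi = extractvalue { <16 x float>, <16 x float> } %pair, 1  ; gather of field 1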
