@@ -5245,3 +5245,309 @@ bb:
   ret <8 x i64> %tmp1
 }
 declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+
+; Test gathers from struct
+%struct.pt = type { float, float, float, i32 }
+
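+; %struct.pt is 16 bytes, so the gathers below scale the loaded indices by 16:
+; the vpsllq/vpslld $4 in the expected output performs that scaling after the
+; indices are masked to 29 bits (536870911 = 0x1FFFFFFF) and zero-extended.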
+define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_gather_structpt_16f32_mask_index:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vmovdqu64 (%rsi), %zmm0
+; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; KNL_64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; KNL_64-NEXT: vpsllq $4, %zmm0, %zmm0
+; KNL_64-NEXT: vpsllq $4, %zmm2, %zmm2
+; KNL_64-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_structpt_16f32_mask_index:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: vpslld $4, (%ecx), %zmm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX_SMALL-LABEL: test_gather_structpt_16f32_mask_index:
+; SKX_SMALL: # %bb.0:
+; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
+; SKX_SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
+; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
+; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
+; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: test_gather_structpt_16f32_mask_index:
+; SKX_LARGE: # %bb.0:
+; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
+; SKX_LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; SKX_LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
+; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
+; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
+; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; SKX_LARGE-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_structpt_16f32_mask_index:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_32-NEXT: vpmovd2m %zmm0, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SKX_32-NEXT: vpslld $4, (%ecx), %zmm0
+; SKX_32-NEXT: vgatherdps (%eax,%zmm0), %zmm1 {%k1}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: retl
+  %wide.load = load <16 x i32>, ptr %arr, align 4
+  %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
+  %zext = zext <16 x i32> %and to <16 x i64>
+  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  ret <16 x float> %res
+}
+
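+; Same gather, but the GEP also indexes field 1 (the second float), so the
+; expected gathers carry a constant displacement of 4 bytes.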
+define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vmovdqu64 (%rsi), %zmm0
+; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; KNL_64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; KNL_64-NEXT: vpsllq $4, %zmm0, %zmm0
+; KNL_64-NEXT: vpsllq $4, %zmm2, %zmm2
+; KNL_64-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
+; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: vpslld $4, (%ecx), %zmm0
+; KNL_32-NEXT: vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX_SMALL-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; SKX_SMALL: # %bb.0:
+; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
+; SKX_SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
+; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
+; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
+; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; SKX_LARGE: # %bb.0:
+; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
+; SKX_LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; SKX_LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
+; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
+; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
+; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; SKX_LARGE-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_32-NEXT: vpmovd2m %zmm0, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SKX_32-NEXT: vpslld $4, (%ecx), %zmm0
+; SKX_32-NEXT: vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: retl
+  %wide.load = load <16 x i32>, ptr %arr, align 4
+  %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
+  %zext = zext <16 x i32> %and to <16 x i64>
+  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext, i32 1
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  ret <16 x float> %res
+}
+
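+; Gather field 0 and field 1 from the same index vector; both gathers are
+; expected to reuse the shifted indices and differ only in the 0/4 byte
+; displacement.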
+define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_gather_16f32_mask_index_pair:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vmovdqu64 (%rsi), %zmm0
+; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; KNL_64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; KNL_64-NEXT: vpsllq $4, %zmm0, %zmm3
+; KNL_64-NEXT: vpsllq $4, %zmm2, %zmm2
+; KNL_64-NEXT: vextractf64x4 $1, %zmm1, %ymm4
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: kmovw %k2, %k3
+; KNL_64-NEXT: vmovaps %ymm4, %ymm0
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
+; KNL_64-NEXT: vmovaps %ymm1, %ymm5
+; KNL_64-NEXT: kmovw %k1, %k3
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
+; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
+; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16f32_mask_index_pair:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: vpslld $4, (%ecx), %zmm2
+; KNL_32-NEXT: vpbroadcastd %eax, %zmm0
+; KNL_32-NEXT: vpaddd %zmm2, %zmm0, %zmm3
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm2), %zmm0 {%k2}
+; KNL_32-NEXT: vgatherdps 4(,%zmm3), %zmm1 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX_SMALL-LABEL: test_gather_16f32_mask_index_pair:
+; SKX_SMALL: # %bb.0:
+; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
+; SKX_SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_SMALL-NEXT: vpsllq $4, %zmm0, %zmm3
+; SKX_SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
+; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
+; SKX_SMALL-NEXT: kmovw %k2, %k3
+; SKX_SMALL-NEXT: vmovaps %ymm4, %ymm0
+; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
+; SKX_SMALL-NEXT: vmovaps %ymm1, %ymm5
+; SKX_SMALL-NEXT: kmovw %k1, %k3
+; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
+; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
+; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
+; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
+; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: test_gather_16f32_mask_index_pair:
+; SKX_LARGE: # %bb.0:
+; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
+; SKX_LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; SKX_LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_LARGE-NEXT: vpsllq $4, %zmm0, %zmm3
+; SKX_LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm4
+; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
+; SKX_LARGE-NEXT: vmovaps %ymm4, %ymm0
+; SKX_LARGE-NEXT: kmovw %k2, %k3
+; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
+; SKX_LARGE-NEXT: vmovaps %ymm1, %ymm5
+; SKX_LARGE-NEXT: kmovw %k1, %k3
+; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
+; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
+; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
+; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
+; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; SKX_LARGE-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_16f32_mask_index_pair:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_32-NEXT: vpmovd2m %zmm0, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SKX_32-NEXT: vpslld $4, (%ecx), %zmm2
+; SKX_32-NEXT: vpbroadcastd %eax, %zmm0
+; SKX_32-NEXT: vpaddd %zmm2, %zmm0, %zmm3
+; SKX_32-NEXT: kmovw %k1, %k2
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: vgatherdps (%eax,%zmm2), %zmm0 {%k2}
+; SKX_32-NEXT: vgatherdps 4(,%zmm3), %zmm1 {%k1}
+; SKX_32-NEXT: retl
+  %wide.load = load <16 x i32>, ptr %arr, align 4
+  %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
+  %zext = zext <16 x i32> %and to <16 x i64>
+  %ptrs1 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext
+  %res1 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs1, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext, i32 1
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  %pair1 = insertvalue {<16 x float>, <16 x float>} undef, <16 x float> %res1, 0
+  %pair2 = insertvalue {<16 x float>, <16 x float>} %pair1, <16 x float> %res, 1
+  ret {<16 x float>, <16 x float>} %pair2
+}