Skip to content

Commit 87aa1c1

Browse files
committed
[X86] X86FixupVectorConstantsPass - use scheduler model to avoid regressions
When attempting to replace a full vector constant load with an instruction that uses a smaller constant, check the scheduler model to ensure the new instruction isn't slower.

WIP - this currently ignores several factors, including:
- optsize/minsize functions
- what trade-off in throughput/latency/code size is acceptable for a smaller constant data size
- identifying hoisted constant loads where the slower instruction might be acceptable

Fixes #135998
1 parent 156985e commit 87aa1c1

30 files changed

+317
-178
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,13 @@ static Constant *rebuildZExtCst(const Constant *C, unsigned NumBits,
336336
return rebuildExtCst(C, false, NumBits, NumElts, SrcEltBitWidth);
337337
}
338338

339+
// Decide whether an optional "new" value beats an optional "current" value.
//
// Returns std::nullopt when no decision can be made: either value is missing
// or the two values do not differ. Otherwise returns true iff the new value is
// strictly less than the current one.
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  // Without both values there is nothing to compare.
  if (!NewVal || !CurVal)
    return std::nullopt;

  // Equal values are reported as "undecided" so the caller can fall through
  // to its next criterion.
  if (!(*NewVal != *CurVal))
    return std::nullopt;

  return *NewVal < *CurVal;
}
345+
339346
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
340347
MachineBasicBlock &MBB,
341348
MachineInstr &MI) {
@@ -356,6 +363,51 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
356363
std::function<Constant *(const Constant *, unsigned, unsigned, unsigned)>
357364
RebuildConstant;
358365
};
366+
367+
auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
368+
// We already checked that SchedModel exists in `NewOpcPreferable`.
369+
return MCSchedModel::getReciprocalThroughput(
370+
*ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
371+
};
372+
auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
373+
// We already checked that SchedModel exists in `NewOpcPreferable`.
374+
return MCSchedModel::computeInstrLatency(
375+
*ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
376+
};
377+
auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
378+
if (unsigned Size = TII->get(Opcode).getSize())
379+
return Size;
380+
// Zero size means we where unable to compute it.
381+
return std::nullopt;
382+
};
383+
384+
// Decide whether switching from the current opcode (Opc) to Fixup.Op is
// worthwhile. The criteria are tried in order and the first one that yields
// a strict difference decides:
//   1. reciprocal throughput (only with an instruction scheduling model)
//   2. latency               (only with an instruction scheduling model)
//   3. instruction size
// In every case a smaller value for the new opcode means "preferable". If no
// criterion can decide (values unavailable or all equal) the new opcode wins,
// since it reduces constant pool size.
auto NewOpcPreferable = [&](const FixupEntry &Fixup) -> bool {
  unsigned NewOpc = Fixup.Op;

  std::optional<bool> Res;
  if (SM->hasInstrSchedModel()) {
    // Compare tput -> lat -> code size.
    // TODO: how much increase in tput/latency/size should we permit for the
    // reduction in constant pool size?
    Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
    if (Res.has_value())
      return *Res;

    Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
    if (Res.has_value())
      return *Res;
  }

  // TODO: Include data reduction in size comparison?
  // CmpOptionals takes (NewVal, CurVal) and answers "is new smaller?", so the
  // new opcode's size must be the first argument, matching the throughput and
  // latency comparisons above. Passing them the other way round would prefer
  // the new opcode when it is the *larger* instruction.
  Res = CmpOptionals(GetInstSize(NewOpc), GetInstSize(Opc));
  if (Res.has_value())
    return *Res;

  // We either were unable to get tput/lat/codesize or all values were equal.
  // Prefer the new opcode for reduced constant pool size.
  return true;
};
410+
359411
auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned RegBitWidth,
360412
unsigned OperandNo) {
361413
#ifdef EXPENSIVE_CHECKS
@@ -372,7 +424,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
372424
unsigned CstBitWidth = C->getType()->getPrimitiveSizeInBits();
373425
RegBitWidth = RegBitWidth ? RegBitWidth : CstBitWidth;
374426
for (const FixupEntry &Fixup : Fixups) {
375-
if (Fixup.Op) {
427+
if (Fixup.Op && NewOpcPreferable(Fixup)) {
376428
// Construct a suitable constant and adjust the MI to use the new
377429
// constant pool entry.
378430
if (Constant *NewCst = Fixup.RebuildConstant(

llvm/test/CodeGen/X86/avgceils.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
3939
;
4040
; AVX512-LABEL: test_fixed_v16i8:
4141
; AVX512: # %bb.0:
42-
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
42+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
4343
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
4444
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
4545
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -82,7 +82,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
8282
;
8383
; AVX512-LABEL: test_ext_v16i8:
8484
; AVX512: # %bb.0:
85-
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
85+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
8686
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
8787
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
8888
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -365,7 +365,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
365365
;
366366
; AVX512-LABEL: test_fixed_v32i8:
367367
; AVX512: # %bb.0:
368-
; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
368+
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
369369
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
370370
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
371371
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -416,7 +416,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
416416
;
417417
; AVX512-LABEL: test_ext_v32i8:
418418
; AVX512: # %bb.0:
419-
; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
419+
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
420420
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
421421
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
422422
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -875,7 +875,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
875875
;
876876
; AVX512-LABEL: test_fixed_v64i8:
877877
; AVX512: # %bb.0:
878-
; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
878+
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
879879
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
880880
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
881881
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
@@ -946,7 +946,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
946946
;
947947
; AVX512-LABEL: test_ext_v64i8:
948948
; AVX512: # %bb.0:
949-
; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
949+
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
950950
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
951951
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
952952
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0

llvm/test/CodeGen/X86/avgfloors.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
5252
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
5353
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
5454
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
55-
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
55+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
5656
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
5757
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
5858
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -107,7 +107,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
107107
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
108108
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
109109
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
110-
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
110+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
111111
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
112112
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
113113
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -404,7 +404,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
404404
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
405405
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
406406
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
407-
; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
407+
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
408408
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
409409
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
410410
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -477,7 +477,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
477477
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
478478
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
479479
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
480-
; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
480+
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
481481
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
482482
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
483483
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -965,7 +965,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
965965
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
966966
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
967967
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
968-
; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
968+
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
969969
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
970970
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
971971
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
@@ -1077,7 +1077,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
10771077
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
10781078
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
10791079
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
1080-
; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
1080+
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
10811081
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
10821082
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
10831083
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0

llvm/test/CodeGen/X86/avx512-build-vector.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ define <16 x float> @test3(<4 x float> %a) {
1515
; CHECK-LABEL: test3:
1616
; CHECK: ## %bb.0:
1717
; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
18-
; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
18+
; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
1919
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
2020
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
2121
; CHECK-NEXT: vmovaps %zmm1, %zmm0

llvm/test/CodeGen/X86/combine-or-shuffle.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -836,7 +836,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
836836
;
837837
; AVX512-LABEL: or_and_v4i32:
838838
; AVX512: # %bb.0:
839-
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
839+
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,15,7]
840840
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
841841
; AVX512-NEXT: retq
842842
%1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>

llvm/test/CodeGen/X86/combine-or.ll

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,11 @@ define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
2929
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
3030
; SSE-NEXT: retq
3131
;
32-
; AVX1-LABEL: or_zext_v2i32:
33-
; AVX1: # %bb.0:
34-
; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
35-
; AVX1-NEXT: retq
36-
;
37-
; AVX2-LABEL: or_zext_v2i32:
38-
; AVX2: # %bb.0:
39-
; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
40-
; AVX2-NEXT: # xmm0 = mem[0,0]
41-
; AVX2-NEXT: retq
32+
; AVX-LABEL: or_zext_v2i32:
33+
; AVX: # %bb.0:
34+
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
35+
; AVX-NEXT: # xmm0 = mem[0,0]
36+
; AVX-NEXT: retq
4237
%1 = zext <2 x i32> %a0 to <2 x i64>
4338
%2 = or <2 x i64> %1, <i64 4294967295, i64 4294967295>
4439
ret <2 x i64> %2
@@ -261,7 +256,7 @@ define i64 @PR89533(<64 x i8> %a0) {
261256
;
262257
; AVX2-LABEL: PR89533:
263258
; AVX2: # %bb.0:
264-
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
259+
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
265260
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
266261
; AVX2-NEXT: vpmovmskb %ymm0, %eax
267262
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm0

llvm/test/CodeGen/X86/constant-pool-sharing.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
105105
;
106106
; AVX-LINUX-LABEL: store_repeated_constants:
107107
; AVX-LINUX: # %bb.0:
108-
; AVX-LINUX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
109-
; AVX-LINUX-NEXT: # ymm0 = mem[0,1,0,1]
108+
; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
110109
; AVX-LINUX-NEXT: vmovaps %ymm0, (%rdi)
111110
; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
112111
; AVX-LINUX-NEXT: vmovaps %xmm0, %xmm1
@@ -119,8 +118,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
119118
;
120119
; AVX-MSVC-LABEL: store_repeated_constants:
121120
; AVX-MSVC: # %bb.0:
122-
; AVX-MSVC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
123-
; AVX-MSVC-NEXT: # ymm0 = mem[0,1,0,1]
121+
; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
124122
; AVX-MSVC-NEXT: vmovaps %ymm0, (%rcx)
125123
; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
126124
; AVX-MSVC-NEXT: vmovaps %xmm0, %xmm1

llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
389389
;
390390
; CHECK-FMA-LABEL: fmul_pow2_8xhalf:
391391
; CHECK-FMA: # %bb.0:
392-
; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
392+
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
393393
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
394394
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
395395
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -649,12 +649,26 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
649649
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
650650
; CHECK-SSE-NEXT: retq
651651
;
652-
; CHECK-AVX-LABEL: fdiv_pow2_8xhalf:
653-
; CHECK-AVX: # %bb.0:
654-
; CHECK-AVX-NEXT: vpsllw $10, %xmm0, %xmm0
655-
; CHECK-AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
656-
; CHECK-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
657-
; CHECK-AVX-NEXT: retq
652+
; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf:
653+
; CHECK-AVX2: # %bb.0:
654+
; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
655+
; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
656+
; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
657+
; CHECK-AVX2-NEXT: retq
658+
;
659+
; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf:
660+
; CHECK-NO-FASTFMA: # %bb.0:
661+
; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0
662+
; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
663+
; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
664+
; CHECK-NO-FASTFMA-NEXT: retq
665+
;
666+
; CHECK-FMA-LABEL: fdiv_pow2_8xhalf:
667+
; CHECK-FMA: # %bb.0:
668+
; CHECK-FMA-NEXT: vpsllw $10, %xmm0, %xmm0
669+
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
670+
; CHECK-FMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
671+
; CHECK-FMA-NEXT: retq
658672
%p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
659673
%p2_f = uitofp <8 x i16> %p2 to <8 x half>
660674
%r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
@@ -1135,7 +1149,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
11351149
;
11361150
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
11371151
; CHECK-FMA: # %bb.0:
1138-
; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
1152+
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
11391153
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
11401154
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
11411155
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0

0 commit comments

Comments
 (0)