[X86] X86FixupVectorConstantsPass - use scheduler model to avoid regressions #140028
Conversation
Force-pushed from 87aa1c1 to bb44378
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from a59ed00 to cd49d54
[X86] X86FixupVectorConstantsPass - use scheduler model to avoid regressions

When attempting to replace a full vector constant load with an instruction that uses a smaller constant, check the scheduler model to ensure the instruction isn't slower.

WIP - this currently ignores a couple of factors, including:
- what trade-off in throughput/latency/codesize is acceptable for the smaller data size
- identifying hoisted constant loads where the slower instruction might be acceptable

Fixes llvm#135998
… always be less than the savings in the constant pool (8 bytes or more)

Allow a 1 cycle latency regression per 128 bits of constant data saved (a very basic rule of thumb).
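To make that rule of thumb concrete, here is a minimal standalone sketch (plain C++; the helper name and the worked cases are illustrative, not code from this patch) of how the latency allowance scales with the amount of constant-pool data saved, mirroring the ceiling division used in the patch:

#include <cstdio>

// Hypothetical helper: allow one extra cycle of latency for every 128 bits
// (or part thereof) of constant-pool data the smaller encoding saves.
static int latencyToleranceForBitsSaved(unsigned BitsSaved) {
  return (BitsSaved + 127) / 128; // round up to whole 128-bit blocks
}

int main() {
  // Full 128-bit load replaced by a broadcast of one 32-bit element:
  // 128 - 32 = 96 bits saved -> tolerate 1 extra cycle of latency.
  printf("xmm splat: %d cycle(s)\n", latencyToleranceForBitsSaved(128 - 32));

  // Full 512-bit load replaced by a broadcast of one 32-bit element:
  // 512 - 32 = 480 bits saved -> tolerate 4 extra cycles of latency.
  printf("zmm splat: %d cycle(s)\n", latencyToleranceForBitsSaved(512 - 32));
  return 0;
}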
Force-pushed from cd49d54 to 60dc4a9
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

When attempting to replace a full vector constant load with an instruction that uses a smaller constant, check the scheduler model to ensure the instruction isn't slower.

Throughput must not regress, but allow a small increase in latency based on how much constant data we're saving (I've used a simple estimate of 1 cycle per 128 bits of data saved).

NOTE: this currently ignores hoisted constant loads where the slower instruction might be acceptable.

Fixes #135998

Patch is 75.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140028.diff

30 Files Affected:
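Before the patch itself, a condensed standalone sketch of the decision order described above, using made-up numbers — in the pass the throughput/latency values come from the target's scheduler model, and the struct and function names here are hypothetical:

#include <cstdio>

// Hypothetical stand-in for the per-opcode numbers the scheduler model provides.
struct SchedEstimate {
  double RThroughput; // reciprocal throughput (cycles per instruction)
  int Latency;        // cycles
};

// Sketch of the ordering: throughput must not regress at all; latency may
// regress a little, bounded by LatTol; if the model reports identical numbers,
// prefer the smaller constant-pool entry.
static bool preferSmallerConstant(SchedEstimate Old, SchedEstimate New,
                                  int LatTol) {
  if (Old.RThroughput != New.RThroughput)
    return New.RThroughput < Old.RThroughput;
  if (Old.Latency != New.Latency)
    return New.Latency < Old.Latency + LatTol;
  return true; // tie on throughput and latency: take the size saving
}

int main() {
  // Accepted: same throughput, +2 cycles latency within a 4-cycle allowance.
  printf("%d\n", preferSmallerConstant({1.0, 3}, {1.0, 5}, 4));
  // Rejected: the smaller-constant form has worse throughput.
  printf("%d\n", preferSmallerConstant({0.5, 3}, {1.0, 3}, 4));
  return 0;
}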
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index a415a45775984..5d3b49b733615 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -348,6 +348,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
bool HasBWI = ST->hasBWI();
bool HasVLX = ST->hasVLX();
bool MultiDomain = ST->hasAVX512() || ST->hasNoDomainDelayMov();
+ bool OptSize = MF.getFunction().hasOptSize();
struct FixupEntry {
int Op;
@@ -356,6 +357,36 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
std::function<Constant *(const Constant *, unsigned, unsigned, unsigned)>
RebuildConstant;
};
+
+ auto NewOpcPreferable = [&](const FixupEntry &Fixup,
+ unsigned RegBitWidth) -> bool {
+ if (SM->hasInstrSchedModel()) {
+ unsigned NewOpc = Fixup.Op;
+ auto *OldDesc = SM->getSchedClassDesc(TII->get(Opc).getSchedClass());
+ auto *NewDesc = SM->getSchedClassDesc(TII->get(NewOpc).getSchedClass());
+ unsigned BitsSaved = RegBitWidth - (Fixup.NumCstElts * Fixup.MemBitWidth);
+
+ // Compare tput/lat - avoid any regressions, but allow extra cycle of
+ // latency in exchange for each 128-bit (or less) constant pool reduction
+ // (this is a very simple cost:benefit estimate - there will probably be
+ // better ways to calculate this).
+ double OldTput = MCSchedModel::getReciprocalThroughput(*ST, *OldDesc);
+ double NewTput = MCSchedModel::getReciprocalThroughput(*ST, *NewDesc);
+ if (OldTput != NewTput)
+ return NewTput < OldTput;
+
+ int LatTol = (BitsSaved + 127) / 128;
+ int OldLat = MCSchedModel::computeInstrLatency(*ST, *OldDesc);
+ int NewLat = MCSchedModel::computeInstrLatency(*ST, *NewDesc);
+ if (OldLat != NewLat)
+ return NewLat < (OldLat + LatTol);
+ }
+
+ // We either were unable to get tput/lat or all values were equal.
+ // Prefer the new opcode for reduced constant pool size.
+ return true;
+ };
+
auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned RegBitWidth,
unsigned OperandNo) {
#ifdef EXPENSIVE_CHECKS
@@ -372,7 +403,11 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned CstBitWidth = C->getType()->getPrimitiveSizeInBits();
RegBitWidth = RegBitWidth ? RegBitWidth : CstBitWidth;
for (const FixupEntry &Fixup : Fixups) {
- if (Fixup.Op) {
+ // Always uses the smallest possible constant load with opt/minsize,
+ // otherwise use the smallest instruction that doesn't affect
+ // performance.
+ // TODO: If constant has been hoisted from loop, use smallest constant.
+ if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup, RegBitWidth))) {
// Construct a suitable constant and adjust the MI to use the new
// constant pool entry.
if (Constant *NewCst = Fixup.RebuildConstant(
diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll
index bb6f3fc9f588a..70b2d4be2fd81 100644
--- a/llvm/test/CodeGen/X86/avgceils.ll
+++ b/llvm/test/CodeGen/X86/avgceils.ll
@@ -39,7 +39,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -82,7 +82,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -365,7 +365,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -416,7 +416,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -875,7 +875,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
@@ -946,7 +946,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
index 2566860357130..0508e5ccb5430 100644
--- a/llvm/test/CodeGen/X86/avgfloors.ll
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -52,7 +52,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -107,7 +107,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -404,7 +404,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -477,7 +477,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -965,7 +965,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
@@ -1077,7 +1077,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll
index b21a0c4e36c2b..55478a2e93154 100644
--- a/llvm/test/CodeGen/X86/avx512-build-vector.ll
+++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll
@@ -15,7 +15,7 @@ define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 55b1cdeddb853..95b5fcf8eac52 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -836,7 +836,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: or_and_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,15,7]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
; AVX512-NEXT: retq
%1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index 08262e4d34b26..8c91274abf3dd 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -29,16 +29,11 @@ define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
; SSE-NEXT: retq
;
-; AVX1-LABEL: or_zext_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: or_zext_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
-; AVX2-NEXT: # xmm0 = mem[0,0]
-; AVX2-NEXT: retq
+; AVX-LABEL: or_zext_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
+; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: retq
%1 = zext <2 x i32> %a0 to <2 x i64>
%2 = or <2 x i64> %1, <i64 4294967295, i64 4294967295>
ret <2 x i64> %2
@@ -261,7 +256,7 @@ define i64 @PR89533(<64 x i8> %a0) {
;
; AVX2-LABEL: PR89533:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/constant-pool-sharing.ll b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
index 9e0114c017419..be835f78292e5 100644
--- a/llvm/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
@@ -105,8 +105,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
;
; AVX-LINUX-LABEL: store_repeated_constants:
; AVX-LINUX: # %bb.0:
-; AVX-LINUX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
-; AVX-LINUX-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
; AVX-LINUX-NEXT: vmovaps %ymm0, (%rdi)
; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
; AVX-LINUX-NEXT: vmovaps %xmm0, %xmm1
@@ -119,8 +118,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
;
; AVX-MSVC-LABEL: store_repeated_constants:
; AVX-MSVC: # %bb.0:
-; AVX-MSVC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
-; AVX-MSVC-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
; AVX-MSVC-NEXT: vmovaps %ymm0, (%rcx)
; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
; AVX-MSVC-NEXT: vmovaps %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 59a61722927de..5519d9b787b7f 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -389,7 +389,7 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
;
; CHECK-FMA-LABEL: fmul_pow2_8xhalf:
; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -649,12 +649,26 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX-LABEL: fdiv_pow2_8xhalf:
-; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpsllw $10, %xmm0, %xmm0
-; CHECK-AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
-; CHECK-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
-; CHECK-AVX-NEXT: retq
+; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf:
+; CHECK-NO-FASTFMA: # %bb.0:
+; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT: retq
+;
+; CHECK-FMA-LABEL: fdiv_pow2_8xhalf:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-FMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
%p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
%p2_f = uitofp <8 x i16> %p2 to <8 x half>
%r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
@@ -1135,7 +1149,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 4ac829c0dfd53..1a2cfd69650b8 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -198,7 +198,7 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) nounwind {
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -576,7 +576,8 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
@@ -1023,7 +1024,8 @@ define <4 x i32> @ustest_f...
[truncated]