[X86] X86FixupVectorConstantsPass - use VPMOVSX/ZX extensions for PS/PD domain moves #122601
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

For targets with free domain moves, or AVX512 support, allow the use of VPMOVSX/ZX extension loads to reduce the load sizes. I've limited this to extension to i32/i64 types as we're mostly interested in shuffle mask loading here, but we could include i16 types as well just as easily.

Inspired by a regression on #122485

Patch is 393.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122601.diff

56 Files Affected:
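To illustrate the idea behind the new rebuildSExtCst/rebuildZExtCst fixups, here is a minimal, self-contained C++ sketch; it is not the LLVM implementation, and `narrowToSExtI8` is a hypothetical helper. The point it demonstrates: if every 32-bit element of a constant survives a sign-extending round trip through 8 bits, the constant pool entry can be stored at a quarter of the width and re-widened at load time with VPMOVSXBD.

```cpp
// Minimal sketch (assumption: we only consider i32 elements rebuilt by a
// byte->dword sign extend; the real pass handles several widths, see the
// fixup tables in the diff below).
#include <cstdint>
#include <optional>
#include <vector>

// Returns the narrowed i8 payload if every element survives a sign-extending
// round trip, otherwise std::nullopt (the full-width constant must be kept).
std::optional<std::vector<int8_t>>
narrowToSExtI8(const std::vector<int32_t> &Elts) {
  std::vector<int8_t> Narrow;
  Narrow.reserve(Elts.size());
  for (int32_t V : Elts) {
    if (V < INT8_MIN || V > INT8_MAX)
      return std::nullopt;
    Narrow.push_back(static_cast<int8_t>(V));
  }
  return Narrow;
}

int main() {
  // A typical shuffle-mask constant from the affected tests: every lane is
  // small, so a 32-byte <8 x i32> pool entry shrinks to 8 bytes and is
  // re-expanded at load time by a VPMOVSXBD load.
  std::vector<int32_t> Mask = {1, 13, 11, 14, 7, 10, 1, 6};
  return narrowToSExtI8(Mask) ? 0 : 1;
}
```

The pass performs the analogous per-width checks and, when they succeed, swaps the VMOVAPS/VMOVUPS constant-pool load for the matching VPMOVSX/ZX opcode from the fixup tables added in the diff below.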
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 68a4a0be3a1db7..7390cc58054528 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -338,6 +338,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
bool HasDQI = ST->hasDQI();
bool HasBWI = ST->hasBWI();
bool HasVLX = ST->hasVLX();
+ bool MultiDomain = ST->hasAVX512() || ST->hasNoDomainDelayMov();
struct FixupEntry {
int Op;
@@ -401,47 +402,107 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
case X86::VMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPDrm:
- case X86::VMOVUPSrm:
- return FixupConstant({{X86::VMOVSSrm, 1, 32, rebuildZeroUpperCst},
- {X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst},
- {X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst},
- {X86::VMOVDDUPrm, 1, 64, rebuildSplatCst}},
- 128, 1);
+ case X86::VMOVUPSrm: {
+ FixupEntry Fixups[] = {
+ {MultiDomain ? X86::VPMOVSXBQrm : 0, 2, 8, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXBQrm : 0, 2, 8, rebuildZExtCst},
+ {X86::VMOVSSrm, 1, 32, rebuildZeroUpperCst},
+ {X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXBDrm : 0, 4, 8, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXBDrm : 0, 4, 8, rebuildZExtCst},
+ {MultiDomain ? X86::VPMOVSXWQrm : 0, 2, 16, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXWQrm : 0, 2, 16, rebuildZExtCst},
+ {X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst},
+ {X86::VMOVDDUPrm, 1, 64, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXWDrm : 0, 4, 16, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXWDrm : 0, 4, 16, rebuildZExtCst},
+ {MultiDomain ? X86::VPMOVSXDQrm : 0, 2, 32, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXDQrm : 0, 2, 32, rebuildZExtCst}};
+ return FixupConstant(Fixups, 128, 1);
+ }
case X86::VMOVAPDYrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPDYrm:
- case X86::VMOVUPSYrm:
- return FixupConstant({{X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst},
- {X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst},
- {X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst}},
- 256, 1);
+ case X86::VMOVUPSYrm: {
+ FixupEntry Fixups[] = {
+ {X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVSXBQYrm : 0, 4, 8, rebuildSExtCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVZXBQYrm : 0, 4, 8, rebuildZExtCst},
+ {X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVSXBDYrm : 0, 8, 8, rebuildSExtCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVZXBDYrm : 0, 8, 8, rebuildZExtCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVSXWQYrm : 0, 4, 16, rebuildSExtCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVZXWQYrm : 0, 4, 16, rebuildZExtCst},
+ {X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVSXWDYrm : 0, 8, 16, rebuildSExtCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVZXWDYrm : 0, 8, 16, rebuildZExtCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst},
+ {HasAVX2 && MultiDomain ? X86::VPMOVZXDQYrm : 0, 4, 32,
+ rebuildZExtCst}};
+ return FixupConstant(Fixups, 256, 1);
+ }
case X86::VMOVAPDZ128rm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPDZ128rm:
- case X86::VMOVUPSZ128rm:
- return FixupConstant({{X86::VMOVSSZrm, 1, 32, rebuildZeroUpperCst},
- {X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst},
- {X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
- {X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}},
- 128, 1);
+ case X86::VMOVUPSZ128rm: {
+ FixupEntry Fixups[] = {
+ {MultiDomain ? X86::VPMOVSXBQZ128rm : 0, 2, 8, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXBQZ128rm : 0, 2, 8, rebuildZExtCst},
+ {X86::VMOVSSZrm, 1, 32, rebuildZeroUpperCst},
+ {X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXBDZ128rm : 0, 4, 8, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXBDZ128rm : 0, 4, 8, rebuildZExtCst},
+ {MultiDomain ? X86::VPMOVSXWQZ128rm : 0, 2, 16, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXWQZ128rm : 0, 2, 16, rebuildZExtCst},
+ {X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
+ {X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXWDZ128rm : 0, 4, 16, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXWDZ128rm : 0, 4, 16, rebuildZExtCst},
+ {MultiDomain ? X86::VPMOVSXDQZ128rm : 0, 2, 32, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXDQZ128rm : 0, 2, 32, rebuildZExtCst}};
+ return FixupConstant(Fixups, 128, 1);
+ }
case X86::VMOVAPDZ256rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPDZ256rm:
- case X86::VMOVUPSZ256rm:
- return FixupConstant(
- {{X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst},
- {X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
- {X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}},
- 256, 1);
+ case X86::VMOVUPSZ256rm: {
+ FixupEntry Fixups[] = {
+ {X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXBQZ256rm : 0, 4, 8, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXBQZ256rm : 0, 4, 8, rebuildZExtCst},
+ {X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXBDZ256rm : 0, 8, 8, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXBDZ256rm : 0, 8, 8, rebuildZExtCst},
+ {MultiDomain ? X86::VPMOVSXWQZ256rm : 0, 4, 16, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXWQZ256rm : 0, 4, 16, rebuildZExtCst},
+ {X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXWDZ256rm : 0, 8, 16, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXWDZ256rm : 0, 8, 16, rebuildZExtCst},
+ {MultiDomain ? X86::VPMOVSXDQZ256rm : 0, 4, 32, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXDQZ256rm : 0, 4, 32, rebuildZExtCst}};
+ return FixupConstant(Fixups, 256, 1);
+ }
case X86::VMOVAPDZrm:
case X86::VMOVAPSZrm:
case X86::VMOVUPDZrm:
- case X86::VMOVUPSZrm:
- return FixupConstant({{X86::VBROADCASTSSZrm, 1, 32, rebuildSplatCst},
- {X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst},
- {X86::VBROADCASTF32X4Zrm, 1, 128, rebuildSplatCst},
- {X86::VBROADCASTF64X4Zrm, 1, 256, rebuildSplatCst}},
- 512, 1);
+ case X86::VMOVUPSZrm: {
+ FixupEntry Fixups[] = {
+ {X86::VBROADCASTSSZrm, 1, 32, rebuildSplatCst},
+ {X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXBQZrm : 0, 8, 8, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXBQZrm : 0, 8, 8, rebuildZExtCst},
+ {X86::VBROADCASTF32X4Zrm, 1, 128, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXBDZrm : 0, 16, 8, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXBDZrm : 0, 16, 8, rebuildZExtCst},
+ {MultiDomain ? X86::VPMOVSXWQZrm : 0, 8, 16, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXWQZrm : 0, 8, 16, rebuildZExtCst},
+ {X86::VBROADCASTF64X4Zrm, 1, 256, rebuildSplatCst},
+ {MultiDomain ? X86::VPMOVSXWDZrm : 0, 16, 16, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXWDZrm : 0, 16, 16, rebuildZExtCst},
+ {MultiDomain ? X86::VPMOVSXDQZrm : 0, 8, 32, rebuildSExtCst},
+ {MultiDomain ? X86::VPMOVZXDQZrm : 0, 8, 32, rebuildZExtCst}};
+ return FixupConstant(Fixups, 512, 1);
+ }
/* Integer Loads */
case X86::MOVDQArm:
case X86::MOVDQUrm: {
diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll
index 55478a2e93154b..b21a0c4e36c2bd 100644
--- a/llvm/test/CodeGen/X86/avx512-build-vector.ll
+++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll
@@ -15,7 +15,7 @@ define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 8d98290ba29a6a..8aa898f3ec5765 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1060,12 +1060,12 @@ define i32 @test13_crash(i32 %x, i32 %y) {
define <4 x i1> @test14() {
; CHECK-LABEL: test14:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,1,0,1]
; CHECK-NEXT: retq
;
; X86-LABEL: test14:
; X86: ## %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1]
+; X86-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,1,0,1]
; X86-NEXT: retl
%a = bitcast i16 21845 to <16 x i1>
%b = extractelement <16 x i1> %a, i32 2
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 5078130f180779..5d901a8a380a9c 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -903,7 +903,7 @@ define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) {
define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,0,3,2]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
@@ -1001,7 +1001,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i
define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,3,2,5]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
@@ -1189,7 +1189,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32>
define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6]
; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
@@ -1283,7 +1283,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x
define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
@@ -1321,7 +1321,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x
define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
@@ -1424,7 +1424,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x
define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,0,0,13]
; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
@@ -1465,7 +1465,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x
define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <16 x i32>, ptr %vp
@@ -1768,7 +1768,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32
define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10]
+; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [12,9,4,10]
; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-FAST-NEXT: vzeroupper
@@ -2050,7 +2050,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i
define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,0,0,7]
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT: retq
@@ -2185,7 +2185,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i
define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,6,5,3]
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT: retq
@@ -2711,7 +2711,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64>
define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [4,1]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,1]
; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0
; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-FAST-NEXT: vzeroupper
@@ -2847,7 +2847,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec,
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,0]
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
@@ -2863,7 +2863,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec,
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,0]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
@@ -2879,7 +2879,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec,
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,2,7,0]
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
@@ -2895,7 +2895,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec,
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,2,7,0]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
@@ -2910,7 +2910,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec
define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,5,2]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
@@ -2922,7 +2922,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec,
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,3,5,2]
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
@@ -2938,7 +2938,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec,
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,5,2]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
@@ -2954,7 +2954,7 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1]
; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, ptr %vp
@@ -2965,7 +2965,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1]
; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
@@ -2982,7 +2982,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1]
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
@@ -2999,7 +2999,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_pe...
[truncated]
LGTM.
LLVM Buildbot has detected a new failure on one of its builders. Full details are available at: https://lab.llvm.org/buildbot/#/builders/81/builds/3566
For targets with free domain moves, or AVX512 support, allow the use of VPMOVSX/ZX extension loads to reduce the load sizes.
I've limited this to extension to i32/i64 types as we're mostly interested in shuffle mask loading here, but we could include i16 types as well just as easily.
Inspired by a regression on #122485
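For reference, a minimal sketch of the `MultiDomain` gating described above; `SubtargetInfo`, `allowIntExtForFPDomainLoad`, and the sample configurations are hypothetical stand-ins rather than LLVM API. The extension loads are only considered when the target either has AVX512 or reports that FP/integer domain-crossing moves carry no delay.

```cpp
// Hedged sketch of the gating added by the patch; SubtargetInfo stands in for
// the real X86Subtarget, whose condition in the diff is
// `ST->hasAVX512() || ST->hasNoDomainDelayMov()`.
struct SubtargetInfo {
  bool HasAVX512;           // the patch treats any AVX512 target as eligible
  bool HasNoDomainDelayMov; // tuning flag: FP<->int domain moves are free
};

// The VPMOVSX/ZX fixups are integer-domain loads feeding PS/PD users, so they
// are only offered when the domain crossing is known to cost nothing.
bool allowIntExtForFPDomainLoad(const SubtargetInfo &ST) {
  return ST.HasAVX512 || ST.HasNoDomainDelayMov;
}

int main() {
  SubtargetInfo PlainAVX2{false, false}; // keeps VMOVAPS/VMOVUPS constant loads
  SubtargetInfo AVX512{true, false};     // eligible for the narrowed loads
  return (!allowIntExtForFPDomainLoad(PlainAVX2) &&
          allowIntExtForFPDomainLoad(AVX512))
             ? 0
             : 1;
}
```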