[AMDGPU][RegBankCombiner] Add cast_of_cast and constant_fold_cast combines #131307
Merged
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes

We can add a bunch of exts/truncs during RBSelect; we should be able to fold them away afterwards.

Patch is 184.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131307.diff

8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 36653867fbba0..a21505356274b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -180,5 +180,6 @@ def AMDGPURegBankCombiner : GICombiner<
[unmerge_merge, unmerge_cst, unmerge_undef,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
- identity_combines, redundant_and]> {
+ identity_combines, redundant_and, constant_fold_cast_op,
+ cast_of_cast_combines]> {
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 3a52497bd6e91..07fcb02d98649 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -41,10 +41,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_and_b32 s2, s2, 0x7f
; GFX8-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
@@ -72,10 +71,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_and_b32 s2, s2, 0x7f
; GFX9-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
@@ -102,9 +100,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX10-NEXT: s_and_b32 s2, s2, 0x7f
; GFX10-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_mul_lo_u32 v1, v0, -7
@@ -134,9 +131,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX11-NEXT: s_and_b32 s2, s2, 0x7f
; GFX11-NEXT: s_and_b32 s1, s1, 0x7f
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -351,11 +347,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_and_b32 s3, s2, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s2, 7, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -365,11 +358,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_and_b32 s3, s2, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s2, 7, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -379,11 +369,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_and_b32 s3, s2, 7
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_andn2_b32 s2, 7, s2
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -393,11 +380,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_and_b32 s3, s2, 7
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
; GFX11-NEXT: s_lshl_b32 s0, s0, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -489,7 +473,6 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8-LABEL: s_fshl_i8_4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: s_lshr_b32 s1, s1, 4
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -498,7 +481,6 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-LABEL: s_fshl_i8_4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
; GFX9-NEXT: s_lshr_b32 s1, s1, 4
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -508,7 +490,6 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshr_b32 s1, s1, 4
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
@@ -517,9 +498,8 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s1, s1, 4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4)
@@ -586,7 +566,6 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8-LABEL: s_fshl_i8_5:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, 5
; GFX8-NEXT: s_lshr_b32 s1, s1, 3
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -595,7 +574,6 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-LABEL: s_fshl_i8_5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, 5
; GFX9-NEXT: s_lshr_b32 s1, s1, 3
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -605,7 +583,6 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshl_b32 s0, s0, 5
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshr_b32 s1, s1, 3
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
@@ -614,9 +591,8 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s1, s1, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5)
@@ -702,23 +678,17 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_lshr_b32 s5, s2, 8
; GFX8-NEXT: s_and_b32 s6, s2, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s2, 7, s2
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshl_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s5, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s2, s4, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_andn2_b32 s3, 7, s5
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_andn2_b32 s3, 7, s5
; GFX8-NEXT: s_lshr_b32 s2, s2, s3
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
@@ -733,23 +703,17 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_lshr_b32 s5, s2, 8
; GFX9-NEXT: s_and_b32 s6, s2, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s2, 7, s2
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s5, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_and_b32 s2, s4, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_andn2_b32 s3, 7, s5
; GFX9-NEXT: s_lshr_b32 s2, s2, 1
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_andn2_b32 s3, 7, s5
; GFX9-NEXT: s_lshr_b32 s2, s2, s3
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
@@ -761,25 +725,19 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX10-LABEL: s_fshl_v2i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s4, s1, 8
-; GFX10-NEXT: s_and_b32 s5, s2, 7
-; GFX10-NEXT: s_lshr_b32 s6, s2, 8
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_lshr_b32 s5, s2, 8
+; GFX10-NEXT: s_and_b32 s6, s2, 7
; GFX10-NEXT: s_and_b32 s4, s4, 0xff
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, s5
-; GFX10-NEXT: s_and_b32 s5, s6, 7
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT: s_andn2_b32 s6, 7, s6
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_lshl_b32 s0, s0, s6
+; GFX10-NEXT: s_and_b32 s6, s5, 7
; GFX10-NEXT: s_lshr_b32 s4, s4, 1
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_andn2_b32 s5, 7, s5
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_lshl_b32 s3, s3, s5
-; GFX10-NEXT: s_lshr_b32 s4, s4, s6
+; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_lshl_b32 s3, s3, s6
+; GFX10-NEXT: s_lshr_b32 s4, s4, s5
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s2, s3, s4
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -792,25 +750,19 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX11-LABEL: s_fshl_v2i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s4, s1, 8
-; GFX11-NEXT: s_and_b32 s5, s2, 7
-; GFX11-NEXT: s_lshr_b32 s6, s2, 8
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-NEXT: s_and_b32 s6, s2, 7
; GFX11-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshl_b32 s0, s0, s5
-; GFX11-NEXT: s_and_b32 s5, s6, 7
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT: s_and_not1_b32 s6, 7, s6
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_lshl_b32 s0, s0, s6
+; GFX11-NEXT: s_and_b32 s6, s5, 7
; GFX11-NEXT: s_lshr_b32 s4, s4, 1
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_and_not1_b32 s5, 7, s5
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_lshl_b32 s3, s3, s5
-; GFX11-NEXT: s_lshr_b32 s4, s4, s6
+; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_lshl_b32 s3, s3, s6
+; GFX11-NEXT: s_lshr_b32 s4, s4, s5
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_or_b32 s2, s3, s4
; GFX11-NEXT: s_or_b32 s0, s0, s1
@@ -1030,11 +982,8 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_lshr_b32 s11, s2, 24
; GFX8-NEXT: s_and_b32 s12, s2, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
-; GFX8-NEXT: s_and_b32 s12, 0xffff, s12
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s2, 7, s2
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_lshr_b32 s5, s0, 24
@@ -1042,29 +991,24 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s9, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s2, s6, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_andn2_b32 s3, 7, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_andn2_b32 s3, 7, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, s3
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s2, s10, 7
-; GFX8-NEXT: s_lshl_b32 s2, s4, s2
; GFX8-NEXT: s_and_b32 s3, s7, 0xff
-; GFX8-NEXT: s_andn2_b32 s4, 7, s10
+; GFX8-NEXT: s_lshl_b32 s2, s4, s2
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_andn2_b32 s4, 7, s10
; GFX8-NEXT: s_lshr_b32 s3, s3, s4
; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: s_and_b32 s3, s11, 7
-; GFX8-NEXT: s_lshl_b32 s3, s5, s3
-; GFX8-NEXT: s_andn2_b32 s5, 7, s11
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_lshl_b32 s3, s5, s3
; GFX8-NEXT: s_lshr_b32 s4, s8, 1
-; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT: s_andn2_b32 s5, 7, s11
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
; GFX8-NEXT: s_lshr_b32 s4, s4, s5
@@ -1088,11 +1032,8 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s10, s2, 16
; GFX9-NEXT: s_lshr_b32 s11, s2, 24
; GFX9-NEXT: s_and_b32 s12, s2, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
-; GFX9-NEXT: s_and_b32 s12, 0xffff, s12
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s2, 7, s2
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s5, s0, 24
@@ -1100,29 +1041,24 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s9, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_and_b32 s2, s6, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_andn2_b32 s3, 7, s9
; GFX9-NEXT: s_lshr_b32 s2, s2, 1
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_andn2_b32 s3, 7, s9
; GFX9-NEXT: s_lshr_b32 s2, s2, s3
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s2, s10, 7
-; GFX9-NEXT: s_lshl_b32 s2, s4, s2
; GFX9-NEXT: s_and_b32 s3, s7, 0xff
-; GFX9-NEXT: s_andn2_b32 s4, 7, s10
+; GFX9-NEXT: s_lshl_b32 s2, s4, s2
; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX9-NEXT: s_andn2_b32 s4, 7, s10
; GFX9-NEXT: s_lshr_b32 s3, s3, s4
; GFX9-NEXT: s_or_b32 s2, s2, s3
; GFX9-NEXT: s_and_b32 s3, s11, 7
-; GFX9-NEXT: s_lshl_b32 s3, s5, s3
-; GFX9-NEXT: s_andn2_b32 s5, 7, s11
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_lshl_b32 s3, s5, s3
; GFX9-NEXT: s_lshr_b32 s4, s8, 1
-; GFX9-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX9-NEXT: s_andn2_b32 s5, 7, s11
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
@@ -1146,41 +1082,33 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX10-NEXT: s_lshr_b32 s10, s2, 16
; GFX10-NEXT: s_lshr_b32 s11, s2, 24
; GFX10-NEXT: s_and_b32 s12, s2, 7
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
+; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_lshr_b32 s3, s0, 8
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
-; GFX10-NEXT: s_and_b32 s2, s9, 7
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_and_b32 s2, s6, 0xff
+; GFX10-NEXT: s_and_b32 s6, s9, 7
+; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: s_andn2_b32 s9, 7, s9
-; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_lshr_b32 s6, s6, 1
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s9
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, s12
-; GFX10-NEXT: s_lshl_b32 s2, s3, s2
-; GFX10-NEXT: s_lshr_b32 s3, s6, s9
+; GFX10-NEXT: s_lshl_b32 s3, s3, s6
+; GFX10-NEXT: s_lshr_b32 s2, s2, s9
; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_or_b32 s1, s2, s3
-; GFX10-NEXT: s_and_b32 s3, s7, 0xff
+; GFX10-NEXT: s_or_b32 s1, s3, s2
+; GFX10-NEXT: s_and_b32 s2, s7, 0xff
+; GFX10-NEXT: s_and_b32 s3, s10, 7
+; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: s_andn2_b32 s6, 7, s10
-; GFX10-NEXT: s_lshr_b32 s3, s3, 1
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_and_b32 s2, s10, 7
-; GFX10-NEXT: s_lshr_b32 s3, s3, s6
-; GFX10-NEXT: s_andn2_b32 s6, 7, s11
-; GFX10-NEXT: s_lshl_b32 s2, s4, s2
+; GFX10-NEXT: s_lshl_b32 s3, s4, s3
+; GFX10-NEXT: s_lshr_b32 s2, s2, s6
; GFX10-NEXT: s_and_b32 s4, s11, 7
-; GFX10-NEXT: s_lshr_b32 s7, s8, 1
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_lshr_b32 s6, s8, 1
+; GFX10-NEXT: s_andn2_b32 s7, 7, s11
; GFX10-NEXT: s_lshl_b32 s4, s5, s4
-; GFX10-NEXT: s_lshr_b32 s5, s7, s6
-; GFX10-NEXT: s_or_b32 s2, s2, s3
+; GFX10-NEXT: s_lshr_b32 s5, s6, s7
+; GFX10-NEXT: s_or_b32 s2, s3, s2
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_or_b32 s3, s4, s5
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
@@ -1204,41 +1132,33 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX11-NEXT: s_lshr_b32 s10, s2, 16
; GFX11-NEXT: s_lshr_b32 s11, s2, 24
; GFX11-NEXT: s_and_b32 s12, s2, 7
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
+; GFX11-NEXT: s_an...
[truncated]
arsenm approved these changes Mar 14, 2025
Base automatically changed from users/pierre-vh/gisel-bitwise-i32-promo to main on March 17, 2025 09:23
We can add a bunch of exts/truncs during RBSelect; we should be able to fold them away afterwards.
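
For readers skimming the test diffs above: the dropped `s_and_b32 ..., 0xffff` instructions are the lowered form of the zext/trunc pairs RBSelect inserts around 16-bit values, and they appear to be redundant here because the operand has already been masked to 7 or 8 bits (or to a 3-bit shift amount) immediately beforehand. The following is a small standalone C++ sanity check of that arithmetic, written for this summary rather than taken from the patch or its tests; the variable names and the choice of patterns are illustrative only.

```cpp
// Standalone sketch (not part of the patch): checks that the 0xffff masks
// removed in the s_fshl_* test diffs cannot change the result once the value
// has already been narrowed by an earlier AND.
#include <cassert>
#include <cstdint>

int main() {
  // Only the low bits matter: every removed mask sits after an AND with 0xff
  // or 7, so exhaustively checking all 16-bit inputs covers the relevant cases.
  for (uint32_t x = 0; x <= 0xFFFFu; ++x) {
    // Pattern from s_fshl_i8 (GFX8/9):
    //   s_and_b32 s1, s1, 0xff
    //   s_and_b32 s1, 0xffff, s1   ; removed by this patch
    //   s_lshr_b32 s1, s1, 1
    uint32_t byteVal = x & 0xFFu;
    assert(((byteVal & 0xFFFFu) >> 1) == (byteVal >> 1));

    // Shift-amount pattern:
    //   s_andn2_b32 s2, 7, s2
    //   s_and_b32 s2, 0xffff, s2   ; removed by this patch
    uint32_t amt = ~x & 7u;
    assert((amt & 0xFFFFu) == amt);
  }
  return 0;
}
```

Build without `-DNDEBUG` so the asserts are active; the loop completes silently if the dropped masks are indeed no-ops for these patterns.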