-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU: Fix overly conservative immediate operand check #127563
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Fix overly conservative immediate operand check #127563
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: The real legality check is performed later anyway, so this was unnecessarily blocking immediate folds in handled cases. This also stops folding s_fmac_f32 to s_fmamk_f32 in a few tests, but that seems better. Full diff: https://github.com/llvm/llvm-project/pull/127563.diff 10 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 84773349e0ca0..cbd858b9002ee 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -830,7 +830,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
if (UseOpIdx >= Desc.getNumOperands())
return false;
- if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
+ // Filter out unhandled pseudos.
+ if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return false;
uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 4be00fedb972e..89078f20f1d47 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -920,9 +920,7 @@ define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT: ; return to shader part epilog
;
@@ -962,9 +960,7 @@ define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inr
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GFX6-NEXT: ; return to shader part epilog
;
@@ -1004,9 +1000,7 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT: ; return to shader part epilog
;
@@ -1060,9 +1054,7 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg
; GFX6-NEXT: s_lshl_b32 s5, s13, 16
; GFX6-NEXT: s_and_b32 s6, s12, 0xffff
; GFX6-NEXT: s_or_b32 s5, s5, s6
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: s_mov_b32 s7, s6
-; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index e7119c89ac06c..065fadf3b5ef3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -919,9 +919,7 @@ define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1)
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT: ; return to shader part epilog
;
@@ -961,9 +959,7 @@ define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inre
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX6-NEXT: ; return to shader part epilog
;
@@ -1003,9 +999,7 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT: ; return to shader part epilog
;
@@ -1059,9 +1053,7 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %
; GFX6-NEXT: s_lshl_b32 s5, s13, 16
; GFX6-NEXT: s_and_b32 s6, s12, 0xffff
; GFX6-NEXT: s_or_b32 s5, s5, s6
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: s_mov_b32 s7, s6
-; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index ed85fb19d9051..43322b1e23412 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -118,13 +118,11 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
; GFX7-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX7-NEXT: s_mov_b32 s8, -1
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_lshl_b32 s1, s3, 16
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX7-NEXT: s_mov_b32 s9, s8
; GFX7-NEXT: s_or_b32 s1, s1, s2
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9]
+; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], -1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: scalar_xnor_v4i16_one_use:
diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
index f6fc69a6e3e47..ea93e3ac1e595 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
@@ -5,16 +5,14 @@ define amdgpu_cs <2 x i32> @f() {
; CHECK-LABEL: f:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_mov_b32 s5, s4
; CHECK-NEXT: s_mov_b32 s6, s4
; CHECK-NEXT: s_mov_b32 s7, s4
-; CHECK-NEXT: s_mov_b32 s0, s4
; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; CHECK-NEXT: s_mov_b32 s1, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[0:1], v[0:1]
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index 4011c21af6904..661af021e8a84 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -192,10 +192,8 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b)
;
; GISEL-LABEL: s_csh_v4i32:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_mov_b32 s8, 31
-; GISEL-NEXT: s_mov_b32 s9, s8
-; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
-; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
+; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], 31
+; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], 31
; GISEL-NEXT: s_lshl_b32 s8, s0, s4
; GISEL-NEXT: s_lshl_b32 s9, s1, s5
; GISEL-NEXT: s_lshl_b32 s10, s2, s6
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir
index 08693ec9db1d4..aeca4398f9a83 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir
@@ -13,7 +13,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAC_F32 1056964608, [[COPY]], [[COPY1]], implicit $mode
; CHECK-NEXT: $sgpr0 = COPY %fma
%0:sreg_32 = COPY $sgpr0
%1:sreg_32 = COPY $sgpr1
@@ -33,7 +33,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAC_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode
; CHECK-NEXT: $sgpr0 = COPY %fma
%0:sreg_32 = COPY $sgpr0
%1:sreg_32 = COPY $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 492a30b67089c..bc49f70cbee11 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -742,10 +742,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_mov_b32 s0, 1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_mov_b32 s1, s0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], 1
; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index b59f3c0d410f8..9b03a72fd826d 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -87,7 +87,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -308,7 +308,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0{{$}}
define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -432,7 +432,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
-; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0{{$}}
define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -556,8 +556,8 @@ bb:
; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0{{$}}
define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
index 81d792183dc06..debbfce7dadcc 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
@@ -218,7 +218,7 @@ define amdgpu_ps float @_amdgpu_ps_main() {
; GFX1150-NEXT: s_mov_b32 s3, s0
; GFX1150-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0
+; GFX1150-NEXT: s_fmac_f32 s0, s1, 4.0
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-NEXT: v_mov_b32_e32 v0, s0
; GFX1150-NEXT: ; return to shader part epilog
@@ -232,7 +232,7 @@ define amdgpu_ps float @_amdgpu_ps_main() {
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0
+; GFX12-NEXT: s_fmac_f32 s0, s1, 4.0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
|
b28280b
to
2f11ad0
Compare
3dd61c6
to
2f31f25
Compare
2f11ad0
to
1d3dce0
Compare
2f31f25
to
2903c24
Compare
1d3dce0
to
e9a741f
Compare
2903c24
to
5501596
Compare
It might be better to form s_fmamk_f32 pre-RA, since that gives RA the freedom to use different physical registers for dst and src2 if it wants to. Then we could shrink back to s_fmac_f32 post-RA if the physical registers are the same. |
I tried to do this a few times and it was always worse. The three address form is a stronger hint to RA than anything else |
5501596
to
f3accf6
Compare
e9a741f
to
7e136af
Compare
7e136af
to
0edaa87
Compare
The real legality check is performed later anyway, so this was unnecessarily blocking immediate folds in handled cases. This also stops folding s_fmac_f32 to s_fmamk_f32 in a few tests, but that seems better. The globalisel changes look suspicious, it may be mishandling constants for VOP3P instructions.
f3accf6
to
9c076c0
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/66/builds/10457 Here is the relevant piece of the build log for the reference
|
The real legality check is performed later anyway, so this was unnecessarily blocking immediate folds in handled cases. This also stops folding s_fmac_f32 to s_fmamk_f32 in a few tests, but that seems better. The globalisel changes look suspicious, it may be mishandling constants for VOP3P instructions.
The real legality check is performed later anyway, so this was
unnecessarily blocking immediate folds in handled cases.
This also stops folding s_fmac_f32 to s_fmamk_f32 in a few tests,
but that seems better. The globalisel changes look suspicious,
it may be mishandling constants for VOP3P instructions.