-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU: Fix creating minimum3/maximum3 nodes pre-gfx12 #93027
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: These would fail to select. Patch is 252.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93027.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b7548671f2c54..db5b467f22389 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1312,6 +1312,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// \returns true if the target has IEEE fminimum/fmaximum instructions
bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
+ // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
+ bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
+
// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
bool hasRrWGMode() const { return getGeneration() >= GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7fe6c2d0db8f5..1d2a5fff23568 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13199,6 +13199,33 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return SDValue();
}
+/// \return true if the subtarget supports minimum3 and maximum3 with the given
+/// base min/max opcode \p Opc for type \p VT.
+static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
+ EVT VT) {
+ switch (Opc) {
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case AMDGPUISD::FMIN_LEGACY:
+ case AMDGPUISD::FMAX_LEGACY:
+ return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
+ case ISD::SMAX:
+ case ISD::SMIN:
+ case ISD::UMAX:
+ case ISD::UMIN:
+ return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
+ default:
+ return false;
+ }
+
+ llvm_unreachable("not a min/max opcode");
+}
+
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -13211,10 +13238,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
// Only do this if the inner op has one use since this will just increases
// register pressure for no benefit.
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- !VT.isVector() &&
- (VT == MVT::i32 || VT == MVT::f32 ||
- ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
+ if (supportsMin3Max3(*Subtarget, Opc, VT)) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 6e45084dc4b80..3caebacb187a5 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,98 +1,2844 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.maximum.f32(float %a, float %b)
- %f1 = call float @llvm.maximum.f32(float %f0, float %c)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; Commute operand of second fmaximum
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.maximum.f32(float %a, float %b)
- %f1 = call float @llvm.maximum.f32(float %c, float %f0)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.maximum.f16(half %a, half %b)
- %f1 = call half @llvm.maximum.f16(half %f0, half %c)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.maximum.f16(half %a, half %b)
- %f1 = call half @llvm.maximum.f16(half %c, half %f0)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3
-; since there are no pack instructions for fmaximum3.
-; GCN-LABEL: {{^}}no_fmaximum3_v2f16:
-; GCN: v_pk_maximum_f16 v0, v0, v1
-; GCN: v_pk_maximum_f16 v0, v2, v0
-; GCN: v_pk_maximum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
- %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
- %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max)
- %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d)
- ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fmaximum3_olt_0_f64:
-; GCN-COUNT-2: v_maximum_f64
-define amdgpu_kernel void @no_fmaximum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile double, ptr addrspace(1) %aptr, align 4
- %b = load volatile double, ptr addrspace(1) %bptr, align 4
- %c = load volatile double, ptr addrspace(1) %cptr, align 4
- %f0 = call double @llvm.maximum.f64(double %a, double %b)
- %f1 = call double @llvm.maximum.f64(double %f0, double %c)
- store double %f1, ptr addrspace(1) %out, align 4
- ret void
-}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
declare double @llvm.maximum.f64(double, double)
declare float @llvm.maximum.f32(float, float)
declare half @llvm.maximum.f16(half, half)
declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+define float @v_fmaximum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %c, float %max0)
+ ret float %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_maximum3_f32 v0, s0, s1, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, s2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ %cast = bitcast float %max1 to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %readfirstlane
+}
+
+define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, |v0|, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, v0, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, |v0|, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, -v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %b.fneg = fneg float %b
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b.fneg)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, -|v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %a.fneg.fabs = fneg float %a.fabs
+ %b.fneg.fabs = fneg float %b.fabs
+ %c.fneg.fabs = fneg float %c.fabs
+ %max0 = call float @llvm.maximum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, -v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -...
[truncated]
|
558df80
to
6d4a7ec
Compare
Can you fix/test GISel as well?
It doesn't have the maximum3/minimum3 combine right now, so there's nothing to do yet.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
These would fail to select.
6d4a7ec
to
3fec245
Compare
These would fail to select.