Skip to content

AMDGPU: Fix creating minimum3/maximum3 nodes pre-gfx12 #93027

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 23, 2024

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented May 22, 2024

These would fail to select.

@llvmbot
Copy link
Member

llvmbot commented May 22, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

These would fail to select.


Patch is 252.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93027.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+28-4)
  • (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+2839-93)
  • (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+2839-93)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b7548671f2c54..db5b467f22389 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1312,6 +1312,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // \returns true if the target has IEEE fminimum/fmaximum instructions
   bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
 
+  // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
+  bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
+
   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
   bool hasRrWGMode() const { return getGeneration() >= GFX12; }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7fe6c2d0db8f5..1d2a5fff23568 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13199,6 +13199,33 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
   return SDValue();
 }
 
+/// \return true if the subtarget supports minimum3 and maximum3 with the given
+/// base min/max opcode \p Opc for type \p VT.
+static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
+                             EVT VT) {
+  switch (Opc) {
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
+  case AMDGPUISD::FMIN_LEGACY:
+  case AMDGPUISD::FMAX_LEGACY:
+    return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
+    return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
+  case ISD::SMAX:
+  case ISD::SMIN:
+  case ISD::UMAX:
+  case ISD::UMIN:
+    return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
+  default:
+    return false;
+  }
+
+  llvm_unreachable("not a min/max opcode");
+}
+
 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -13211,10 +13238,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
   // Only do this if the inner op has one use since this will just increases
   // register pressure for no benefit.
 
-  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
-      !VT.isVector() &&
-      (VT == MVT::i32 || VT == MVT::f32 ||
-       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
+  if (supportsMin3Max3(*Subtarget, Opc, VT)) {
     // max(max(a, b), c) -> max3(a, b, c)
     // min(min(a, b), c) -> min3(a, b, c)
     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 6e45084dc4b80..3caebacb187a5 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,98 +1,2844 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile float, ptr addrspace(1) %aptr, align 4
-  %b = load volatile float, ptr addrspace(1) %bptr, align 4
-  %c = load volatile float, ptr addrspace(1) %cptr, align 4
-  %f0 = call float @llvm.maximum.f32(float %a, float %b)
-  %f1 = call float @llvm.maximum.f32(float %f0, float %c)
-  store float %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; Commute operand of second fmaximum
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile float, ptr addrspace(1) %aptr, align 4
-  %b = load volatile float, ptr addrspace(1) %bptr, align 4
-  %c = load volatile float, ptr addrspace(1) %cptr, align 4
-  %f0 = call float @llvm.maximum.f32(float %a, float %b)
-  %f1 = call float @llvm.maximum.f32(float %c, float %f0)
-  store float %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile half, ptr addrspace(1) %aptr, align 2
-  %b = load volatile half, ptr addrspace(1) %bptr, align 2
-  %c = load volatile half, ptr addrspace(1) %cptr, align 2
-  %f0 = call half @llvm.maximum.f16(half %a, half %b)
-  %f1 = call half @llvm.maximum.f16(half %f0, half %c)
-  store half %f1, ptr addrspace(1) %out, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile half, ptr addrspace(1) %aptr, align 2
-  %b = load volatile half, ptr addrspace(1) %bptr, align 2
-  %c = load volatile half, ptr addrspace(1) %cptr, align 2
-  %f0 = call half @llvm.maximum.f16(half %a, half %b)
-  %f1 = call half @llvm.maximum.f16(half %c, half %f0)
-  store half %f1, ptr addrspace(1) %out, align 2
-  ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3
-; since there are no pack instructions for fmaximum3.
-; GCN-LABEL: {{^}}no_fmaximum3_v2f16:
-; GCN: v_pk_maximum_f16 v0, v0, v1
-; GCN: v_pk_maximum_f16 v0, v2, v0
-; GCN: v_pk_maximum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
-  %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
-  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max)
-  %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d)
-  ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fmaximum3_olt_0_f64:
-; GCN-COUNT-2: v_maximum_f64
-define amdgpu_kernel void @no_fmaximum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile double, ptr addrspace(1) %aptr, align 4
-  %b = load volatile double, ptr addrspace(1) %bptr, align 4
-  %c = load volatile double, ptr addrspace(1) %cptr, align 4
-  %f0 = call double @llvm.maximum.f64(double %a, double %b)
-  %f1 = call double @llvm.maximum.f64(double %f0, double %c)
-  store double %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
 
 declare double @llvm.maximum.f64(double, double)
 declare float @llvm.maximum.f32(float, float)
 declare half @llvm.maximum.f16(half, half)
 declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+define float @v_fmaximum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v2, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v2, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %c, float %max0)
+  ret float %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum3_f32 v0, s0, s1, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, s2, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  %cast = bitcast float %max1 to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %readfirstlane
+}
+
+define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, v0, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b.fabs)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, -v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e64 v1, v0, -v2
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg float %a
+  %b.fneg = fneg float %b
+  %c.fneg = fneg float %c
+  %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b.fneg)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e64 v1, v0, -|v2|
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %a.fneg.fabs = fneg float %a.fabs
+  %b.fneg.fabs = fneg float %b.fabs
+  %c.fneg.fabs = fneg float %c.fabs
+  %max0 = call float @llvm.maximum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg.fabs)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, -v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg float %a
+  %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -...
[truncated]

@arsenm arsenm force-pushed the amdgpu-no-minimum3-maximum3 branch 3 times, most recently from 558df80 to 6d4a7ec Compare May 22, 2024 13:46
@Sisyph
Copy link
Contributor

Sisyph commented May 22, 2024

Can you fix/test GISel as well?

@arsenm
Copy link
Contributor Author

arsenm commented May 22, 2024

Can you fix/test GISel as well?

It doesn't have the maximum3/minimum3 combine right now, so there's nothing to do yet

Copy link
Collaborator

@rampitec rampitec left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@arsenm arsenm force-pushed the amdgpu-no-minimum3-maximum3 branch from 6d4a7ec to 3fec245 Compare May 23, 2024 13:31
@arsenm arsenm merged commit 2401b61 into llvm:main May 23, 2024
3 checks passed
@arsenm arsenm deleted the amdgpu-no-minimum3-maximum3 branch May 23, 2024 13:59
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants