Skip to content

Commit 212b78a

Browse files
authored
DAG: Improve fminimum/fmaximum vector expansion logic (#93579)
expandFMINIMUM_FMAXIMUM should be a never-fail API: the client asked for the node to be expanded, and it can always be expanded. The early bail-out in this function was tied up with what the VectorLegalizer wanted. Prefer using the min/max opcodes, and unroll the vector operation if we don't have a vselect. This seems to produce better code in all the changed tests.
1 parent ab33fa5 commit 212b78a

11 files changed

+4758
-16271
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -992,11 +992,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
992992
break;
993993
case ISD::FMINIMUM:
994994
case ISD::FMAXIMUM:
995-
if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) {
996-
Results.push_back(Expanded);
997-
return;
998-
}
999-
break;
995+
Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG));
996+
return;
1000997
case ISD::SMIN:
1001998
case ISD::SMAX:
1002999
case ISD::UMIN:

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8427,10 +8427,6 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
84278427
bool IsMax = Opc == ISD::FMAXIMUM;
84288428
SDNodeFlags Flags = N->getFlags();
84298429

8430-
if (VT.isVector() &&
8431-
isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
8432-
return SDValue();
8433-
84348430
// First, implement comparison not propagating NaN. If no native fmin or fmax
84358431
// available, use plain select with setcc instead.
84368432
SDValue MinMax;
@@ -8447,6 +8443,9 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
84478443
} else if (isOperationLegalOrCustom(CompOpc, VT)) {
84488444
MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags);
84498445
} else {
8446+
if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
8447+
return DAG.UnrollVectorOp(N);
8448+
84508449
// NaN (if exists) will be propagated later, so orderness doesn't matter.
84518450
SDValue Compare =
84528451
DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);

llvm/test/CodeGen/AMDGPU/fmaximum3.ll

Lines changed: 120 additions & 480 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/fminimum3.ll

Lines changed: 120 additions & 480 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll

Lines changed: 17 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -554,28 +554,14 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
554554
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555555
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
556556
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
557-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
558-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
557+
; GFX8-NEXT: v_max_f16_e32 v4, v3, v2
559558
; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00
560559
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2
561-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
562-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 64
563-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
564-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 64
565-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
566-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4
567-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
568-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
569-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
560+
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
561+
; GFX8-NEXT: v_max_f16_e32 v3, v0, v1
570562
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
571-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
572-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 64
573-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
574-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 64
575-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
576-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3
577563
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
578-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
564+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
579565
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
580566
; GFX8-NEXT: s_setpc_b64 s[30:31]
581567
;
@@ -669,26 +655,9 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
669655
; GFX8-LABEL: v_maximum_v2f16__nnan:
670656
; GFX8: ; %bb.0:
671657
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
672-
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
673-
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
674-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
675-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
676-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 64
677-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
678-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 64
679-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
680-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4
681-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
682-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
683-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
684-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 64
685-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
686-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 64
687-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
688-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3
689-
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
690-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
691-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
658+
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
659+
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
660+
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
692661
; GFX8-NEXT: s_setpc_b64 s[30:31]
693662
;
694663
; GFX9-LABEL: v_maximum_v2f16__nnan:
@@ -754,13 +723,11 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
754723
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
755724
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
756725
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
757-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
758-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
726+
; GFX8-NEXT: v_max_f16_e32 v4, v3, v2
759727
; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00
760728
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2
761729
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
762-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
763-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
730+
; GFX8-NEXT: v_max_f16_e32 v3, v0, v1
764731
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
765732
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
766733
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
@@ -857,14 +824,9 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
857824
; GFX8-LABEL: v_maximum_v2f16__nnan_nsz:
858825
; GFX8: ; %bb.0:
859826
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860-
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
861-
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
862-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
863-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
864-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
865-
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
866-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
867-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
827+
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
828+
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
829+
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
868830
; GFX8-NEXT: s_setpc_b64 s[30:31]
869831
;
870832
; GFX9-LABEL: v_maximum_v2f16__nnan_nsz:
@@ -938,31 +900,15 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
938900
; GFX8-NEXT: s_lshr_b32 s6, s5, 16
939901
; GFX8-NEXT: s_lshr_b32 s7, s4, 16
940902
; GFX8-NEXT: v_mov_b32_e32 v0, s6
941-
; GFX8-NEXT: v_mov_b32_e32 v1, s7
942-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, s7, v0
943-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
944-
; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00
903+
; GFX8-NEXT: v_max_f16_e32 v1, s7, v0
904+
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
945905
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0
946-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
947-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s7, 64
948-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
949-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s6, 64
950-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
951-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v2
906+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
952907
; GFX8-NEXT: v_mov_b32_e32 v1, s5
953-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
954-
; GFX8-NEXT: v_mov_b32_e32 v2, s4
955-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, s4, v1
956-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc
908+
; GFX8-NEXT: v_max_f16_e32 v3, s4, v1
957909
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1
958-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
959-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s4, 64
960-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
961-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s5, 64
962-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
963-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3
964910
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
965-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
911+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
966912
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
967913
; GFX8-NEXT: ;;#ASMSTART
968914
; GFX8-NEXT: ; use v0

0 commit comments

Comments
 (0)