Skip to content

Commit f1296cd

Browse files
committed
DAG: Improve fminimum/fmaximum vector expansion logic
First, expandFMINIMUM_FMAXIMUM should be a never-fail API: the client asked for the node to be expanded, and it can always be expanded. Previously, the early-return logic in the expansion was tied up with what the VectorLegalizer wanted. Prefer using the min/max opcodes, and unroll the vector operation if we don't have a legal or custom vselect. This seems to produce better code in all the changed tests.
1 parent aef0bdd commit f1296cd

11 files changed

+4758
-16271
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -990,11 +990,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
990990
break;
991991
case ISD::FMINIMUM:
992992
case ISD::FMAXIMUM:
993-
if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) {
994-
Results.push_back(Expanded);
995-
return;
996-
}
997-
break;
993+
Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG));
994+
return;
998995
case ISD::SMIN:
999996
case ISD::SMAX:
1000997
case ISD::UMIN:

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8430,10 +8430,6 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
84308430
bool IsMax = Opc == ISD::FMAXIMUM;
84318431
SDNodeFlags Flags = N->getFlags();
84328432

8433-
if (VT.isVector() &&
8434-
isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
8435-
return SDValue();
8436-
84378433
// First, implement comparison not propagating NaN. If no native fmin or fmax
84388434
// available, use plain select with setcc instead.
84398435
SDValue MinMax;
@@ -8450,6 +8446,9 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
84508446
} else if (isOperationLegalOrCustom(CompOpc, VT)) {
84518447
MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags);
84528448
} else {
8449+
if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
8450+
return DAG.UnrollVectorOp(N);
8451+
84538452
// NaN (if exists) will be propagated later, so orderness doesn't matter.
84548453
SDValue Compare =
84558454
DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);

llvm/test/CodeGen/AMDGPU/fmaximum3.ll

Lines changed: 120 additions & 480 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/fminimum3.ll

Lines changed: 120 additions & 480 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll

Lines changed: 17 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -554,28 +554,14 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
554554
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555555
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
556556
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
557-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
558-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
557+
; GFX8-NEXT: v_max_f16_e32 v4, v3, v2
559558
; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00
560559
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2
561-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
562-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 64
563-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
564-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 64
565-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
566-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4
567-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
568-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
569-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
560+
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
561+
; GFX8-NEXT: v_max_f16_e32 v3, v0, v1
570562
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
571-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
572-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 64
573-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
574-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 64
575-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
576-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3
577563
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
578-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
564+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
579565
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
580566
; GFX8-NEXT: s_setpc_b64 s[30:31]
581567
;
@@ -674,26 +660,9 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
674660
; GFX8-LABEL: v_maximum_v2f16__nnan:
675661
; GFX8: ; %bb.0:
676662
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
677-
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
678-
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
679-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
680-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
681-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 64
682-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
683-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 64
684-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
685-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4
686-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
687-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
688-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
689-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 64
690-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
691-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 64
692-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
693-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3
694-
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
695-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
696-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
663+
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
664+
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
665+
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
697666
; GFX8-NEXT: s_setpc_b64 s[30:31]
698667
;
699668
; GFX9-LABEL: v_maximum_v2f16__nnan:
@@ -759,13 +728,11 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
759728
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
760729
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
761730
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
762-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
763-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
731+
; GFX8-NEXT: v_max_f16_e32 v4, v3, v2
764732
; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00
765733
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2
766734
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
767-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
768-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
735+
; GFX8-NEXT: v_max_f16_e32 v3, v0, v1
769736
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
770737
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
771738
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
@@ -867,14 +834,9 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
867834
; GFX8-LABEL: v_maximum_v2f16__nnan_nsz:
868835
; GFX8: ; %bb.0:
869836
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870-
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
871-
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
872-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
873-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
874-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
875-
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
876-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
877-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
837+
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
838+
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
839+
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
878840
; GFX8-NEXT: s_setpc_b64 s[30:31]
879841
;
880842
; GFX9-LABEL: v_maximum_v2f16__nnan_nsz:
@@ -948,31 +910,15 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
948910
; GFX8-NEXT: s_lshr_b32 s6, s5, 16
949911
; GFX8-NEXT: s_lshr_b32 s7, s4, 16
950912
; GFX8-NEXT: v_mov_b32_e32 v0, s6
951-
; GFX8-NEXT: v_mov_b32_e32 v1, s7
952-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, s7, v0
953-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
954-
; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00
913+
; GFX8-NEXT: v_max_f16_e32 v1, s7, v0
914+
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
955915
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0
956-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
957-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s7, 64
958-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
959-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s6, 64
960-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
961-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v2
916+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
962917
; GFX8-NEXT: v_mov_b32_e32 v1, s5
963-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
964-
; GFX8-NEXT: v_mov_b32_e32 v2, s4
965-
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, s4, v1
966-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc
918+
; GFX8-NEXT: v_max_f16_e32 v3, s4, v1
967919
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1
968-
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
969-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s4, 64
970-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
971-
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s5, 64
972-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
973-
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3
974920
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
975-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
921+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
976922
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
977923
; GFX8-NEXT: ;;#ASMSTART
978924
; GFX8-NEXT: ; use v0

0 commit comments

Comments
 (0)