Skip to content

Commit cf2b4e0

Browse files
RKSimon, vg0204
authored and committed
[X86] isHorizontalBinOp - always create HADD/SUB if it will be merged with another existing HADD/SUB
Fixes some more cases from llvm#34072 where undemanded vector elements prevent HADD/SUB from being matched on slow targets.
1 parent c9cd43a commit cf2b4e0

File tree

2 files changed

+32
-57
lines changed

2 files changed

+32
-57
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -51571,7 +51571,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
5157151571
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
5157251572
SelectionDAG &DAG, const X86Subtarget &Subtarget,
5157351573
bool IsCommutative,
51574-
SmallVectorImpl<int> &PostShuffleMask) {
51574+
SmallVectorImpl<int> &PostShuffleMask,
51575+
bool ForceHorizOp) {
5157551576
// If either operand is undef, bail out. The binop should be simplified.
5157651577
if (LHS.isUndef() || RHS.isUndef())
5157751578
return false;
@@ -51734,13 +51735,12 @@ static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
5173451735

5173551736
// If the source nodes are already used in HorizOps then always accept this.
5173651737
// Shuffle folding should merge these back together.
51737-
bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
51738+
auto FoundHorizUser = [&](SDNode *User) {
5173851739
return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51739-
});
51740-
bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
51741-
return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51742-
});
51743-
bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
51740+
};
51741+
ForceHorizOp =
51742+
ForceHorizOp || (llvm::any_of(NewLHS->uses(), FoundHorizUser) &&
51743+
llvm::any_of(NewRHS->uses(), FoundHorizUser));
5174451744

5174551745
// Assume a SingleSource HOP if we only shuffle one input and don't need to
5174651746
// shuffle the result.
@@ -51763,6 +51763,13 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
5176351763
bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
5176451764
SmallVector<int, 8> PostShuffleMask;
5176551765

51766+
auto MergableHorizOp = [N](unsigned HorizOpcode) {
51767+
return N->hasOneUse() &&
51768+
N->use_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
51769+
(N->use_begin()->getOperand(0).getOpcode() == HorizOpcode ||
51770+
N->use_begin()->getOperand(1).getOpcode() == HorizOpcode);
51771+
};
51772+
5176651773
switch (Opcode) {
5176751774
case ISD::FADD:
5176851775
case ISD::FSUB:
@@ -51772,7 +51779,7 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
5177251779
SDValue RHS = N->getOperand(1);
5177351780
auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
5177451781
if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51775-
PostShuffleMask)) {
51782+
PostShuffleMask, MergableHorizOp(HorizOpcode))) {
5177651783
SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
5177751784
if (!PostShuffleMask.empty())
5177851785
HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
@@ -51789,7 +51796,7 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
5178951796
SDValue RHS = N->getOperand(1);
5179051797
auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
5179151798
if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51792-
PostShuffleMask)) {
51799+
PostShuffleMask, MergableHorizOp(HorizOpcode))) {
5179351800
auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
5179451801
ArrayRef<SDValue> Ops) {
5179551802
return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);

llvm/test/CodeGen/X86/haddsub-undef.ll

Lines changed: 16 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -1052,31 +1052,15 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
10521052
}
10531053

10541054
define <4 x float> @PR34724_add_v4f32_01u3(<4 x float> %0, <4 x float> %1) {
1055-
; SSE-SLOW-LABEL: PR34724_add_v4f32_01u3:
1056-
; SSE-SLOW: # %bb.0:
1057-
; SSE-SLOW-NEXT: haddps %xmm0, %xmm0
1058-
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
1059-
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
1060-
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1061-
; SSE-SLOW-NEXT: retq
1062-
;
1063-
; SSE-FAST-LABEL: PR34724_add_v4f32_01u3:
1064-
; SSE-FAST: # %bb.0:
1065-
; SSE-FAST-NEXT: haddps %xmm1, %xmm0
1066-
; SSE-FAST-NEXT: retq
1067-
;
1068-
; AVX-SLOW-LABEL: PR34724_add_v4f32_01u3:
1069-
; AVX-SLOW: # %bb.0:
1070-
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1071-
; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
1072-
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
1073-
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1074-
; AVX-SLOW-NEXT: retq
1055+
; SSE-LABEL: PR34724_add_v4f32_01u3:
1056+
; SSE: # %bb.0:
1057+
; SSE-NEXT: haddps %xmm1, %xmm0
1058+
; SSE-NEXT: retq
10751059
;
1076-
; AVX-FAST-LABEL: PR34724_add_v4f32_01u3:
1077-
; AVX-FAST: # %bb.0:
1078-
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1079-
; AVX-FAST-NEXT: retq
1060+
; AVX-LABEL: PR34724_add_v4f32_01u3:
1061+
; AVX: # %bb.0:
1062+
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1063+
; AVX-NEXT: retq
10801064
%3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
10811065
%4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
10821066
%5 = fadd <2 x float> %3, %4
@@ -1088,31 +1072,15 @@ define <4 x float> @PR34724_add_v4f32_01u3(<4 x float> %0, <4 x float> %1) {
10881072
}
10891073

10901074
define <4 x float> @PR34724_add_v4f32_012u(<4 x float> %0, <4 x float> %1) {
1091-
; SSE-SLOW-LABEL: PR34724_add_v4f32_012u:
1092-
; SSE-SLOW: # %bb.0:
1093-
; SSE-SLOW-NEXT: haddps %xmm0, %xmm0
1094-
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1095-
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
1096-
; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1097-
; SSE-SLOW-NEXT: retq
1098-
;
1099-
; SSE-FAST-LABEL: PR34724_add_v4f32_012u:
1100-
; SSE-FAST: # %bb.0:
1101-
; SSE-FAST-NEXT: haddps %xmm1, %xmm0
1102-
; SSE-FAST-NEXT: retq
1103-
;
1104-
; AVX-SLOW-LABEL: PR34724_add_v4f32_012u:
1105-
; AVX-SLOW: # %bb.0:
1106-
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1107-
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1108-
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
1109-
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1110-
; AVX-SLOW-NEXT: retq
1075+
; SSE-LABEL: PR34724_add_v4f32_012u:
1076+
; SSE: # %bb.0:
1077+
; SSE-NEXT: haddps %xmm1, %xmm0
1078+
; SSE-NEXT: retq
11111079
;
1112-
; AVX-FAST-LABEL: PR34724_add_v4f32_012u:
1113-
; AVX-FAST: # %bb.0:
1114-
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1115-
; AVX-FAST-NEXT: retq
1080+
; AVX-LABEL: PR34724_add_v4f32_012u:
1081+
; AVX: # %bb.0:
1082+
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1083+
; AVX-NEXT: retq
11161084
%3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
11171085
%4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
11181086
%5 = fadd <2 x float> %3, %4

0 commit comments

Comments
 (0)