Commit 3ae2bf2

lukel97 authored and kostasalv committed
[VectorCombine] Scalarize binop-like intrinsics (llvm#138095)
Currently VectorCombine can scalarize vector compares and binary ops. This extends it to also scalarize binop-like intrinsics such as umax, minnum, etc.

The motivation behind this is to scalarize more intrinsics in VectorCombine rather than in DAGCombine, so we can sink splats across basic blocks: see llvm#137786.

This currently has very little effect on generated code because InstCombine doesn't yet canonicalize binary intrinsics where one operand is a constant into the form that VectorCombine expects, i.e. `binop (shuffle insert) const --> shuffle (binop insert const)`. The plan is to land this first and then, in a subsequent patch, teach InstCombine to do the canonicalization, to avoid regressions in the meantime.

This uses `isTriviallyVectorizable` to determine whether or not an intrinsic is safe to scalarize. There's also `isTriviallyScalarizable`, but that seems more geared towards the Scalarizer pass and includes intrinsics with multiple return values.

It also only handles intrinsics with two operands of the same type as the return type. In the future we could generalize this to handle an arbitrary number of operands, including unary and ternary intrinsics (e.g. fneg or fma), as well as different operand types (e.g. powi or scmp).
1 parent e567ef8 commit 3ae2bf2
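To make the canonicalization concrete, here is a minimal IR sketch (value names hypothetical, umax standing in for any binop-like intrinsic). Today InstCombine leaves the splat before the intrinsic:

  %ins   = insertelement <4 x i32> poison, i32 %x, i64 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  %r     = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %splat, <4 x i32> splat (i32 42))

The canonical form VectorCombine expects performs the operation on the inserted lane first and splats the result; the final shuffle only reads lane 0, so the poison in the other lanes of %op is never observed:

  %ins = insertelement <4 x i32> poison, i32 %x, i64 0
  %op  = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %ins, <4 x i32> splat (i32 42))
  %r   = shufflevector <4 x i32> %op, <4 x i32> poison, <4 x i32> zeroinitializer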

File tree: 3 files changed, +200 −16 lines changed


llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 52 additions & 16 deletions
@@ -48,6 +48,7 @@ STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
 STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
 STATISTIC(NumScalarBO, "Number of scalar binops formed");
 STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
 
 static cl::opt<bool> DisableVectorCombine(
     "disable-vector-combine", cl::init(false), cl::Hidden,
@@ -1016,21 +1017,34 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
   return true;
 }
 
-/// Match a vector binop or compare instruction with at least one inserted
-/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+/// Match a vector binop, compare or binop-like intrinsic with at least one
+/// inserted scalar operand and convert to scalar binop/cmp/intrinsic followed
+/// by insertelement.
 bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
   Value *Ins0, *Ins1;
   if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
-      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
-    return false;
+      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
+    // TODO: Allow unary and ternary intrinsics
+    // TODO: Allow intrinsics with different argument types
+    // TODO: Allow intrinsics with scalar arguments
+    if (auto *II = dyn_cast<IntrinsicInst>(&I);
+        II && II->arg_size() == 2 &&
+        isTriviallyVectorizable(II->getIntrinsicID()) &&
+        all_of(II->args(),
+               [&II](Value *Arg) { return Arg->getType() == II->getType(); })) {
+      Ins0 = II->getArgOperand(0);
+      Ins1 = II->getArgOperand(1);
+    } else {
+      return false;
+    }
+  }
 
   // Do not convert the vector condition of a vector select into a scalar
   // condition. That may cause problems for codegen because of differences in
   // boolean formats and register-file transfers.
   // TODO: Can we account for that in the cost model?
-  bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
-  if (IsCmp)
+  if (isa<CmpInst>(I))
    for (User *U : I.users())
      if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
        return false;
@@ -1085,15 +1099,25 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   unsigned Opcode = I.getOpcode();
   InstructionCost ScalarOpCost, VectorOpCost;
-  if (IsCmp) {
+  if (isa<CmpInst>(I)) {
     CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
     ScalarOpCost = TTI.getCmpSelInstrCost(
         Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
     VectorOpCost = TTI.getCmpSelInstrCost(
         Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
-  } else {
+  } else if (isa<BinaryOperator>(I)) {
     ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
     VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
+  } else {
+    auto *II = cast<IntrinsicInst>(&I);
+    IntrinsicCostAttributes ScalarICA(
+        II->getIntrinsicID(), ScalarTy,
+        SmallVector<Type *>(II->arg_size(), ScalarTy));
+    ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
+    IntrinsicCostAttributes VectorICA(
+        II->getIntrinsicID(), VecTy,
+        SmallVector<Type *>(II->arg_size(), VecTy));
+    VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
   }
 
   // Get cost estimate for the insert element. This cost will factor into
@@ -1112,20 +1136,27 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
   // inselt NewVecC, (scalar_op V0, V1), Index
-  if (IsCmp)
+  if (isa<CmpInst>(I))
     ++NumScalarCmp;
-  else
+  else if (isa<BinaryOperator>(I))
     ++NumScalarBO;
+  else if (isa<IntrinsicInst>(I))
+    ++NumScalarIntrinsic;
 
   // For constant cases, extract the scalar element, this should constant fold.
   if (IsConst0)
     V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
   if (IsConst1)
     V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
 
-  Value *Scalar =
-      IsCmp ? Builder.CreateCmp(Pred, V0, V1)
-            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+  Value *Scalar;
+  if (isa<CmpInst>(I))
+    Scalar = Builder.CreateCmp(Pred, V0, V1);
+  else if (isa<BinaryOperator>(I))
+    Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+  else
+    Scalar = Builder.CreateIntrinsic(
+        ScalarTy, cast<IntrinsicInst>(I).getIntrinsicID(), {V0, V1});
 
   Scalar->setName(I.getName() + ".scalar");
 
@@ -1135,9 +1166,14 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
     ScalarInst->copyIRFlags(&I);
 
   // Fold the vector constants in the original vectors into a new base vector.
-  Value *NewVecC =
-      IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
-            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+  Value *NewVecC;
+  if (isa<CmpInst>(I))
+    NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1);
+  else if (isa<BinaryOperator>(I))
+    NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+  else
+    NewVecC = Builder.CreateIntrinsic(
+        VecTy, cast<IntrinsicInst>(I).getIntrinsicID(), {VecC0, VecC1});
   Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
   replaceValue(I, *Insert);
   return true;
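In IR terms, the net effect of the new code path is a rewrite like the following sketch, which mirrors the umax_fixed_rhs_const test added in this commit. Before:

  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
  %v = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %x.insert, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)

After: the operation is performed on the scalar lane (the constant's lane-0 value, 1, is extracted via ConstantExpr::getExtractElement and should constant fold), and the result is reinserted into the base vector. The leftover vector umax on poison operands is left for later folds to clean up; %newvecc is a hypothetical name, the pass emits an unnamed temporary:

  %v.scalar = call i32 @llvm.umax.i32(i32 %x, i32 1)
  %newvecc  = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
  %v        = insertelement <4 x i32> %newvecc, i32 %v.scalar, i64 0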
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -p vector-combine -mtriple=x86_64 -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -S -p vector-combine -mtriple=x86_64 -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+
+define <2 x float> @maxnum(float %x, float %y) {
+; SSE2-LABEL: define <2 x float> @maxnum(
+; SSE2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE2-NEXT:    [[X_INSERT:%.*]] = insertelement <2 x float> poison, float [[X]], i32 0
+; SSE2-NEXT:    [[Y_INSERT:%.*]] = insertelement <2 x float> poison, float [[Y]], i32 0
+; SSE2-NEXT:    [[V:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[X_INSERT]], <2 x float> [[Y_INSERT]])
+; SSE2-NEXT:    ret <2 x float> [[V]]
+;
+; AVX2-LABEL: define <2 x float> @maxnum(
+; AVX2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX2-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]])
+; AVX2-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> poison, <2 x float> poison)
+; AVX2-NEXT:    [[V:%.*]] = insertelement <2 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; AVX2-NEXT:    ret <2 x float> [[V]]
+;
+  %x.insert = insertelement <2 x float> poison, float %x, i32 0
+  %y.insert = insertelement <2 x float> poison, float %y, i32 0
+  %v = call <2 x float> @llvm.maxnum(<2 x float> %x.insert, <2 x float> %y.insert)
+  ret <2 x float> %v
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -p vector-combine | FileCheck %s
+
+define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
+  %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> %y.insert)
+  ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %y.insert = insertelement <vscale x 4 x i32> poison, i32 %y, i32 0
+  %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> %y.insert)
+  ret <vscale x 4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %v = call <4 x i32> @llvm.umax(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> %x.insert)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+  ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> %x.insert)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> splat (i32 42))
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> splat (i32 42))
+  ret <vscale x 4 x i32> %v
+}
+
+; Shouldn't be scalarized, not a "trivially vectorizable" intrinsic.
+define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @non_trivially_vectorizable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[Y_INSERT:%.*]] = insertelement <8 x i32> poison, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]])
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %y.insert = insertelement <8 x i32> poison, i32 %y, i32 0
+  %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert)
+  ret <4 x i32> %v
+}
+
+; TODO: We should be able to scalarize this if we preserve the scalar argument.
+define <4 x float> @scalar_argument(float %x) {
+; CHECK-LABEL: define <4 x float> @scalar_argument(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[X_INSERT]], i32 42)
+; CHECK-NEXT:    ret <4 x float> [[V]]
+;
+  %x.insert = insertelement <4 x float> poison, float %x, i32 0
+  %v = call <4 x float> @llvm.powi(<4 x float> %x.insert, i32 42)
+  ret <4 x float> %v
+}
+
+define <4 x i2> @scmp(i32 %x) {
+; CHECK-LABEL: define <4 x i2> @scmp(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x i2> @llvm.scmp.v4i2.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <4 x i2> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %v = call <4 x i2> @llvm.scmp(<4 x i32> %x.insert, <4 x i32> splat (i32 0))
+  ret <4 x i2> %v
+}
