[VectorCombine] Scalarize binop-like intrinsics #138095

Merged 14 commits on May 21, 2025
Changes from 8 commits
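
The gist of the change, as a before/after IR sketch mirroring the umax_fixed test added below (the %base name is illustrative; the actual output uses an unnamed value):

  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
  %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
  %v = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %x.insert, <4 x i32> %y.insert)

becomes

  %v.scalar = call i32 @llvm.umax.i32(i32 %x, i32 %y)
  %base = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
  %v = insertelement <4 x i32> %base, i32 %v.scalar, i64 0
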
73 changes: 56 additions & 17 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -48,6 +48,7 @@ STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");
STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");

static cl::opt<bool> DisableVectorCombine(
"disable-vector-combine", cl::init(false), cl::Hidden,
@@ -1016,21 +1017,34 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
return true;
}

- /// Match a vector binop or compare instruction with at least one inserted
- /// scalar operand and convert to scalar binop/cmp followed by insertelement.
+ /// Match a vector binop, compare or binop-like intrinsic with at least one
+ /// inserted scalar operand and convert to scalar binop/cmp/intrinsic followed
+ /// by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
Value *Ins0, *Ins1;
if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
- !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
- return false;
+ !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
+ // TODO: Allow unary and ternary intrinsics
+ // TODO: Allow intrinsics with different argument types
+ // TODO: Allow intrinsics with scalar arguments
+ if (auto *II = dyn_cast<IntrinsicInst>(&I);
+ II && II->arg_size() == 2 &&
+ isTriviallyVectorizable(II->getIntrinsicID()) &&
+ all_of(II->args(),
+ [&II](Value *Arg) { return Arg->getType() == II->getType(); })) {
+ Ins0 = II->getArgOperand(0);
+ Ins1 = II->getArgOperand(1);
+ } else {
+ return false;
+ }
+ }

// Do not convert the vector condition of a vector select into a scalar
// condition. That may cause problems for codegen because of differences in
// boolean formats and register-file transfers.
// TODO: Can we account for that in the cost model?
- bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
- if (IsCmp)
+ if (isa<CmpInst>(I))
for (User *U : I.users())
if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
return false;
@@ -1085,16 +1099,26 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {

unsigned Opcode = I.getOpcode();
InstructionCost ScalarOpCost, VectorOpCost;
- if (IsCmp) {
+ if (isa<CmpInst>(I)) {
CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
ScalarOpCost = TTI.getCmpSelInstrCost(
Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
VectorOpCost = TTI.getCmpSelInstrCost(
Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
- } else {
+ } else if (isa<BinaryOperator>(I)) {
ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
- }
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ IntrinsicCostAttributes ScalarICA(
+ II->getIntrinsicID(), ScalarTy,
+ SmallVector<Type *>(II->arg_size(), ScalarTy));
+ ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
+ IntrinsicCostAttributes VectorICA(
+ II->getIntrinsicID(), VecTy,
+ SmallVector<Type *>(II->arg_size(), VecTy));
+ VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
+ } else
+ llvm_unreachable("Unexpected instruction type");

// Get cost estimate for the insert element. This cost will factor into
// both sequences.
@@ -1112,20 +1136,28 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {

// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
// inselt NewVecC, (scalar_op V0, V1), Index
- if (IsCmp)
+ if (isa<CmpInst>(I))
++NumScalarCmp;
- else
+ else if (isa<BinaryOperator>(I))
++NumScalarBO;
+ else if (isa<IntrinsicInst>(I))
+ ++NumScalarIntrinsic;

// For constant cases, extract the scalar element, this should constant fold.
if (IsConst0)
V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
if (IsConst1)
V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));

- Value *Scalar =
- IsCmp ? Builder.CreateCmp(Pred, V0, V1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+ Value *Scalar;
+ if (isa<CmpInst>(I))
+ Scalar = Builder.CreateCmp(Pred, V0, V1);
+ else if (isa<BinaryOperator>(I))
+ Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+ else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), {V0, V1});
+ else
+ llvm_unreachable("Unexpected instruction type");

Scalar->setName(I.getName() + ".scalar");

@@ -1135,9 +1167,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
ScalarInst->copyIRFlags(&I);

// Fold the vector constants in the original vectors into a new base vector.
- Value *NewVecC =
- IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+ Value *NewVecC;
+ if (isa<CmpInst>(I))
+ NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1);
+ else if (isa<BinaryOperator>(I))
+ NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+ else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ NewVecC =
+ Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1});
+ else
+ llvm_unreachable("Unexpected instruction type");
Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
replaceValue(I, *Insert);
return true;
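As an aside on the select-condition guard retained above: a minimal, hypothetical IR shape that the pass deliberately leaves alone is a vector compare that feeds a select condition (names here are illustrative, not taken from the tests):

  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
  %cmp = icmp ult <4 x i32> %x.insert, %y
  %sel = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b

Scalarizing %cmp would replace a vector mask with a scalar boolean, which the in-code comment notes can cause boolean-format and register-file-transfer problems for codegen, so such compares are skipped.
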
122 changes: 122 additions & 0 deletions llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
@@ -0,0 +1,122 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -p vector-combine | FileCheck %s

define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
; CHECK-LABEL: define <4 x i32> @umax_fixed(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
; CHECK-NEXT: ret <4 x i32> [[V]]
;
%x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
%y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
%v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> %y.insert)
ret <4 x i32> %v
}

define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
;
%x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
%y.insert = insertelement <vscale x 4 x i32> poison, i32 %y, i32 0
%v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> %y.insert)
ret <vscale x 4 x i32> %v
}

define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
; CHECK-SAME: i32 [[X:%.*]]) {
; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]])
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> poison)
; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
; CHECK-NEXT: ret <4 x i32> [[V]]
;
%x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
%v = call <4 x i32> @llvm.umax(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> %x.insert)
ret <4 x i32> %v
}

define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
; CHECK-SAME: i32 [[X:%.*]]) {
; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1)
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
; CHECK-NEXT: ret <4 x i32> [[V]]
;
%x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
%v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
ret <4 x i32> %v
}

define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
; CHECK-SAME: i32 [[X:%.*]]) {
; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]])
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> poison)
; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
;
%x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
%v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> %x.insert)
ret <vscale x 4 x i32> %v
}

define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
; CHECK-SAME: i32 [[X:%.*]]) {
; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42)
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> splat (i32 42))
; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
;
%x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
%v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> splat (i32 42))
ret <vscale x 4 x i32> %v
}

; Shouldn't be scalarized, not a "trivially vectorizable" intrinsic.
define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) {
; CHECK-LABEL: define <4 x i32> @non_trivially_vectorizable(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement <8 x i32> poison, i32 [[Y]], i32 0
; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]])
; CHECK-NEXT: ret <4 x i32> [[V]]
;
%x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
%y.insert = insertelement <8 x i32> poison, i32 %y, i32 0
%v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert)
ret <4 x i32> %v
}

; TODO: We should be able to scalarize this if we preserve the scalar argument.
define <4 x float> @scalar_argument(float %x) {
; CHECK-LABEL: define <4 x float> @scalar_argument(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i32 0
; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[X_INSERT]], i32 42)
; CHECK-NEXT: ret <4 x float> [[V]]
;
%x.insert = insertelement <4 x float> poison, float %x, i32 0
%v = call <4 x float> @llvm.powi(<4 x float> %x.insert, i32 42)
ret <4 x float> %v
}
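
; A hypothetical scalarized form, assuming the TODO above were implemented and
; the scalar i32 exponent were preserved, might look like:
;   %v.scalar = call float @llvm.powi.f32.i32(float %x, i32 42)
;   %base = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 42)
;   %v = insertelement <4 x float> %base, float %v.scalar, i64 0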

define <4 x i2> @scmp(i32 %x) {
; CHECK-LABEL: define <4 x i2> @scmp(
; CHECK-SAME: i32 [[X:%.*]]) {
; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
; CHECK-NEXT: [[V:%.*]] = call <4 x i2> @llvm.scmp.v4i2.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> zeroinitializer)
; CHECK-NEXT: ret <4 x i2> [[V]]
;
%x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
%v = call <4 x i2> @llvm.scmp(<4 x i32> %x.insert, <4 x i32> splat (i32 0))
ret <4 x i2> %v
}