Commit 3ae2bf2

lukel97 authored and kostasalv committed
[VectorCombine] Scalarize binop-like intrinsics (llvm#138095)
Currently VectorCombine can scalarize vector compares and binary ops. This extends it to also scalarize binop-like intrinsics such as umax, minnum, etc.

The motivation behind this is to scalarize more intrinsics in VectorCombine rather than in DAGCombine, so we can sink splats across basic blocks: see llvm#137786.

This currently has very little effect on generated code because InstCombine doesn't yet canonicalize binary intrinsics where one operand is a constant into the form that VectorCombine expects, i.e. `binop (shuffle insert) const --> shuffle (binop insert const)`. The plan is to land this first and then, in a subsequent patch, teach InstCombine to do the canonicalization, to avoid regressions in the meantime.

This uses `isTriviallyVectorizable` to determine whether or not an intrinsic is safe to scalarize. There's also `isTriviallyScalarizable`, but that seems more geared towards the Scalarizer pass and includes intrinsics with multiple return values.

It also only handles intrinsics with two operands of the same type as the return type. In the future we could generalize this to handle an arbitrary number of operands, including unary and ternary intrinsics (e.g. fneg or fma), as well as different operand types (e.g. powi or scmp).
1 parent e567ef8 commit 3ae2bf2
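To make the canonicalization concrete, here is a minimal IR sketch (value names hypothetical, umax standing in for any binop-like intrinsic). Today InstCombine leaves the splat before the intrinsic:

  %ins   = insertelement <4 x i32> poison, i32 %x, i64 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  %r     = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %splat, <4 x i32> splat (i32 42))

The canonical form VectorCombine expects performs the operation on the inserted lane first and splats the result; the final shuffle only reads lane 0, so the poison in the other lanes of %op is never observed:

  %ins = insertelement <4 x i32> poison, i32 %x, i64 0
  %op  = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %ins, <4 x i32> splat (i32 42))
  %r   = shufflevector <4 x i32> %op, <4 x i32> poison, <4 x i32> zeroinitializer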

File tree: 3 files changed, +200 −16 lines changed


llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 52 additions & 16 deletions
@@ -48,6 +48,7 @@ STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
 STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
 STATISTIC(NumScalarBO, "Number of scalar binops formed");
 STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
 
 static cl::opt<bool> DisableVectorCombine(
     "disable-vector-combine", cl::init(false), cl::Hidden,
@@ -1016,21 +1017,34 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
   return true;
 }
 
-/// Match a vector binop or compare instruction with at least one inserted
-/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+/// Match a vector binop, compare or binop-like intrinsic with at least one
+/// inserted scalar operand and convert to scalar binop/cmp/intrinsic followed
+/// by insertelement.
 bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
   Value *Ins0, *Ins1;
   if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
-      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
-    return false;
+      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
+    // TODO: Allow unary and ternary intrinsics
+    // TODO: Allow intrinsics with different argument types
+    // TODO: Allow intrinsics with scalar arguments
+    if (auto *II = dyn_cast<IntrinsicInst>(&I);
+        II && II->arg_size() == 2 &&
+        isTriviallyVectorizable(II->getIntrinsicID()) &&
+        all_of(II->args(),
+               [&II](Value *Arg) { return Arg->getType() == II->getType(); })) {
+      Ins0 = II->getArgOperand(0);
+      Ins1 = II->getArgOperand(1);
+    } else {
+      return false;
+    }
+  }
 
   // Do not convert the vector condition of a vector select into a scalar
   // condition. That may cause problems for codegen because of differences in
   // boolean formats and register-file transfers.
   // TODO: Can we account for that in the cost model?
-  bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
-  if (IsCmp)
+  if (isa<CmpInst>(I))
    for (User *U : I.users())
      if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
        return false;
@@ -1085,15 +1099,25 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   unsigned Opcode = I.getOpcode();
   InstructionCost ScalarOpCost, VectorOpCost;
-  if (IsCmp) {
+  if (isa<CmpInst>(I)) {
     CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
     ScalarOpCost = TTI.getCmpSelInstrCost(
         Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
     VectorOpCost = TTI.getCmpSelInstrCost(
         Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
-  } else {
+  } else if (isa<BinaryOperator>(I)) {
     ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
     VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
+  } else {
+    auto *II = cast<IntrinsicInst>(&I);
+    IntrinsicCostAttributes ScalarICA(
+        II->getIntrinsicID(), ScalarTy,
+        SmallVector<Type *>(II->arg_size(), ScalarTy));
+    ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
+    IntrinsicCostAttributes VectorICA(
+        II->getIntrinsicID(), VecTy,
+        SmallVector<Type *>(II->arg_size(), VecTy));
+    VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
   }
 
   // Get cost estimate for the insert element. This cost will factor into
@@ -1112,20 +1136,27 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
   // inselt NewVecC, (scalar_op V0, V1), Index
-  if (IsCmp)
+  if (isa<CmpInst>(I))
     ++NumScalarCmp;
-  else
+  else if (isa<BinaryOperator>(I))
     ++NumScalarBO;
+  else if (isa<IntrinsicInst>(I))
+    ++NumScalarIntrinsic;
 
   // For constant cases, extract the scalar element, this should constant fold.
   if (IsConst0)
     V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
   if (IsConst1)
     V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
 
-  Value *Scalar =
-      IsCmp ? Builder.CreateCmp(Pred, V0, V1)
-            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+  Value *Scalar;
+  if (isa<CmpInst>(I))
+    Scalar = Builder.CreateCmp(Pred, V0, V1);
+  else if (isa<BinaryOperator>(I))
+    Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+  else
+    Scalar = Builder.CreateIntrinsic(
+        ScalarTy, cast<IntrinsicInst>(I).getIntrinsicID(), {V0, V1});
 
   Scalar->setName(I.getName() + ".scalar");
 
@@ -1135,9 +1166,14 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
     ScalarInst->copyIRFlags(&I);
 
   // Fold the vector constants in the original vectors into a new base vector.
-  Value *NewVecC =
-      IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
-            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+  Value *NewVecC;
+  if (isa<CmpInst>(I))
+    NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1);
+  else if (isa<BinaryOperator>(I))
+    NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+  else
+    NewVecC = Builder.CreateIntrinsic(
+        VecTy, cast<IntrinsicInst>(I).getIntrinsicID(), {VecC0, VecC1});
   Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
   replaceValue(I, *Insert);
   return true;
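In IR terms, the net effect of the new code path is a rewrite like the following sketch, which mirrors the umax_fixed_rhs_const test added in this commit. Before:

  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
  %v = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %x.insert, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)

After: the operation is performed on the scalar lane (the constant's lane-0 value, 1, is extracted via ConstantExpr::getExtractElement and should constant fold), and the result is reinserted into the base vector. The leftover vector umax on poison operands is left for later folds to clean up; %newvecc is a hypothetical name, the pass emits an unnamed temporary:

  %v.scalar = call i32 @llvm.umax.i32(i32 %x, i32 1)
  %newvecc  = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
  %v        = insertelement <4 x i32> %newvecc, i32 %v.scalar, i64 0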
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -p vector-combine -mtriple=x86_64 -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -S -p vector-combine -mtriple=x86_64 -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+
+define <2 x float> @maxnum(float %x, float %y) {
+; SSE2-LABEL: define <2 x float> @maxnum(
+; SSE2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE2-NEXT:    [[X_INSERT:%.*]] = insertelement <2 x float> poison, float [[X]], i32 0
+; SSE2-NEXT:    [[Y_INSERT:%.*]] = insertelement <2 x float> poison, float [[Y]], i32 0
+; SSE2-NEXT:    [[V:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[X_INSERT]], <2 x float> [[Y_INSERT]])
+; SSE2-NEXT:    ret <2 x float> [[V]]
+;
+; AVX2-LABEL: define <2 x float> @maxnum(
+; AVX2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX2-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]])
+; AVX2-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> poison, <2 x float> poison)
+; AVX2-NEXT:    [[V:%.*]] = insertelement <2 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; AVX2-NEXT:    ret <2 x float> [[V]]
+;
+  %x.insert = insertelement <2 x float> poison, float %x, i32 0
+  %y.insert = insertelement <2 x float> poison, float %y, i32 0
+  %v = call <2 x float> @llvm.maxnum(<2 x float> %x.insert, <2 x float> %y.insert)
+  ret <2 x float> %v
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -p vector-combine | FileCheck %s
+
+define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
+  %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> %y.insert)
+  ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %y.insert = insertelement <vscale x 4 x i32> poison, i32 %y, i32 0
+  %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> %y.insert)
+  ret <vscale x 4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %v = call <4 x i32> @llvm.umax(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> %x.insert)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+  ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> %x.insert)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> splat (i32 42))
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> splat (i32 42))
+  ret <vscale x 4 x i32> %v
+}
+
+; Shouldn't be scalarized, not a "trivially vectorizable" intrinsic.
+define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @non_trivially_vectorizable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[Y_INSERT:%.*]] = insertelement <8 x i32> poison, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]])
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %y.insert = insertelement <8 x i32> poison, i32 %y, i32 0
+  %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert)
+  ret <4 x i32> %v
+}
+
+; TODO: We should be able to scalarize this if we preserve the scalar argument.
+define <4 x float> @scalar_argument(float %x) {
+; CHECK-LABEL: define <4 x float> @scalar_argument(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[X_INSERT]], i32 42)
+; CHECK-NEXT:    ret <4 x float> [[V]]
+;
+  %x.insert = insertelement <4 x float> poison, float %x, i32 0
+  %v = call <4 x float> @llvm.powi(<4 x float> %x.insert, i32 42)
+  ret <4 x float> %v
+}
+
+define <4 x i2> @scmp(i32 %x) {
+; CHECK-LABEL: define <4 x i2> @scmp(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x i2> @llvm.scmp.v4i2.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <4 x i2> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %v = call <4 x i2> @llvm.scmp(<4 x i32> %x.insert, <4 x i32> splat (i32 0))
+  ret <4 x i2> %v
+}
