llvm · NickGuy-Arm · Jul 4, 2024 · Jun 5, 2024 · Jun 10, 2024 · Jun 10, 2024
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
@@ -19209,6 +19209,38 @@ will be on any later loop iteration.
 This intrinsic will only return 0 if the input count is also 0. A non-zero input
 count will produce a non-zero result.
 
+'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accum, <8 x i32> %in)
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %in)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accum, <vscale x 8 x i32> %in)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accum, <vscale x 16 x i32> %in)
+
+Overview:
+"""""""""
+
+The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics reduce the
+input vector down to the number of elements dictated by the result vector, and
+then adds the resulting vector to the accumulator vector. The return type is a
+vector type that matches the type of the accumulator vector.
+
+Arguments:
+""""""""""
+
+The first argument is the accumulator vector. The type of this argument must match the
+return type. The second argument is the vector to reduceinto the accumulator, the length
+of this vector must be a positive integer multiple of the accumulator vector/return type.
+The arguments must be either be both fixed or both scalable vectors, and must have
+matching element types.
+
+
 '``llvm.experimental.vector.histogram.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
@@ -2635,6 +2635,12 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
                                                      [llvm_anyvector_ty],
                                                      [IntrNoMem]>;
 
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
+                                                                       [llvm_anyvector_ty, llvm_anyvector_ty],
+                                                                       [IntrNoMem]>;
+
 //===----------------- Pointer Authentication Intrinsics ------------------===//
 //
 

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7914,6 +7914,28 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I, Trunc);
     return;
   }
+  case Intrinsic::experimental_vector_partial_reduce_add: {
+    auto DL = getCurSDLoc();
+    auto ReducedTy = EVT::getEVT(I.getType());
+    auto OpNode = getValue(I.getOperand(1));
+    auto FullTy = OpNode.getValueType();
+
+    auto Accumulator = getValue(I.getOperand(0));
+    unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements();
+
+    for(unsigned i = 0; i < ScaleFactor; i++) {
+      auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL);
+      auto TargetIndex = DAG.getVectorIdxConstant(i, DL);
+      auto ExistingValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ReducedTy.getScalarType(), {Accumulator, TargetIndex});
+      auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex});
+      N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N);
+      N = DAG.getNode(ISD::ADD, DL, ReducedTy.getScalarType(), ExistingValue, N);
+      Accumulator = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {Accumulator, N, TargetIndex});
+    }
+
+    setValue(&I, Accumulator);
+    return;
+  }
   case Intrinsic::experimental_cttz_elts: {
     auto DL = getCurSDLoc();
     SDValue Op = getValue(I.getOperand(0));

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
@@ -6131,6 +6131,19 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     }
     break;
   }
+  case Intrinsic::experimental_vector_partial_reduce_add: {
+    VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
+    VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());
+
+    auto VecWidth = VecTy->getElementCount().getKnownMinValue();
+    auto AccWidth = AccTy->getElementCount().getKnownMinValue();
+
+    Check((VecWidth % AccWidth) == 0, "Invalid vector widths for partial "
+                                      "reduction. The width of the input vector "
+                                      "must be a postive integer multiple of "
+                                      "the width of the accumulator vector.");
+    break;
+  }
   case Intrinsic::experimental_noalias_scope_decl: {
     NoAliasScopeDecls.push_back(cast<IntrinsicInst>(&Call));
     break;

diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-vector-interleave=1 %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    addv s1, v1.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    mov v0.s[0], w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
+  ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    addv s1, v1.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov w10, v0.s[1]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    add w9, w9, w8
+; CHECK-NEXT:    add w8, w10, w8
+; CHECK-NEXT:    mov v0.s[0], w9
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
+  ret <4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    uaddv d1, p0, z1.s
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z2.s, #0, #1
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov w9, v0.s[1]
+; CHECK-NEXT:    uaddv d1, p0, z1.s
+; CHECK-NEXT:    ptrue p1.s, vl1
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z3.s
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    add w10, w10, w8
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    mov z0.s, p1/m, w10
+; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_quart:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov z6.s, w8
+; CHECK-NEXT:    index z5.s, #0, #1
+; CHECK-NEXT:    ptrue p2.s, vl1
+; CHECK-NEXT:    uaddv d1, p0, z1.s
+; CHECK-NEXT:    mov w9, v0.s[1]
+; CHECK-NEXT:    uaddv d2, p0, z2.s
+; CHECK-NEXT:    uaddv d3, p0, z3.s
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z5.s, z6.s
+; CHECK-NEXT:    uaddv d4, p0, z4.s
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    add w8, w10, w8
+; CHECK-NEXT:    mov w10, #2 // =0x2
+; CHECK-NEXT:    mov z1.s, p2/m, w8
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov z6.s, w10
+; CHECK-NEXT:    mov w10, v0.s[2]
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    mov w9, #3 // =0x3
+; CHECK-NEXT:    cmpeq p2.s, p0/z, z5.s, z6.s
+; CHECK-NEXT:    mov z2.s, w9
+; CHECK-NEXT:    fmov x9, d3
+; CHECK-NEXT:    mov z1.s, p1/m, w8
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    add w9, w10, w9
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z5.s, z2.s
+; CHECK-NEXT:    mov z1.s, p2/m, w9
+; CHECK-NEXT:    fmov x9, d4
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    mov z1.s, p0/m, w8
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half_8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z2.s, z2.s, z3.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z3.s, #0, #1
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov w9, v0.s[1]
+; CHECK-NEXT:    ptrue p1.s, vl1
+; CHECK-NEXT:    uaddv d2, p0, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z3.s, z4.s
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    add w10, w10, w8
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    mov z0.s, p1/m, w10
+; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
+  ret <vscale x 8 x i32> %partial.reduce
+}
+
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>)
+
+attributes #0 = { "target-features"="+sve2" }