[IR][LangRef] Add partial reduction add intrinsic #94499

Merged Jul 4, 2024 (12 commits)
Changes from 1 commit

llvm/docs/LangRef.rst (31 additions, 2 deletions)

@@ -14250,7 +14250,7 @@ Arguments:
""""""""""
The first 4 arguments are similar to ``llvm.instrprof.increment``. The indexing
is specific to callsites, meaning callsites are indexed from 0, independent from
the indexes used by the other intrinsics (such as
``llvm.instrprof.increment[.step]``).

The last argument is the called value of the callsite this intrinsic precedes.
@@ -14264,7 +14264,7 @@ a buffer LLVM can use to perform counter increments (i.e. the lowering of
``llvm.instrprof.increment[.step]``). The address range following the counter
buffer, ``<num-counters>`` x ``sizeof(ptr)`` - sized, is expected to contain
pointers to contexts of functions called from this function ("subcontexts").
LLVM does not dereference into that memory region, just calculates GEPs.

Review comment (collaborator):

nit: unrelated whitespace change.

The lowering of ``llvm.instrprof.callsite`` consists of:

@@ -19209,6 +19209,35 @@ will be on any later loop iteration.
This intrinsic will only return 0 if the input count is also 0. A non-zero input
count will produce a non-zero result.

'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""
This is an overloaded intrinsic.

::

declare <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<8 x i32> %in)
declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in)
declare <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 8 x i32> %in)
declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %in)

Review comment (paulwalker-arm, Jun 6, 2024):

Is it best for these intrinsics to take a single operand? I kind of see them more like binary operators whose input and output operands are less restrictive.

The intent is that we're doing a partial reduction which LoopVectorize currently achieves via an add (or other binop) instruction that is too restrictive (i.e. it forces a very specific ordering for how elements are combined), so we're introducing a new intrinsic that better reflects what LoopVectorize wants to achieve, but without the baggage. This would allow existing code to pick a format of the intrinsic that can be mapped directly to an add instruction if need be.

Review comment (collaborator):

To be specific, I'm proposing

declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)

whereby the result and first operand types match, but the second operand differs (perhaps with a restriction that it must have the same number of elements or more).
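
A sketch of the proposal above (hypothetical: the intrinsic in this commit still takes a single operand). When the accumulator and the second operand have the same type, the two-operand form could be folded to an ordinary vector add:

; Hypothetical two-operand, same-width instance:
;   %r = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32(<4 x i32> %acc, <4 x i32> %in)
; would then be equivalent to a stock binop:
%r = add <4 x i32> %in, %acc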

Overview:
"""""""""

The '``llvm.experimental.vector.partial.reduce.add.*``' intrinsics perform an
integer ``ADD`` reduction of subvectors within a vector, returning each scalar
result as a lane of the result vector. The return type is a vector type whose
element type matches that of the input vector and whose element count is an
integer factor of the input's element count (typically half or a quarter).
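
For example (an illustration consistent with the description above; the grouping shown is one valid target-chosen mapping, not a mandated order):

; A <16 x i32> input reduced to <4 x i32>: each result lane receives the sum
; of four input elements; which elements feed which lane is target-defined.
%partial = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in)
; e.g. lane 0 could be %in[0]+%in[1]+%in[2]+%in[3], or equally
; %in[0]+%in[4]+%in[8]+%in[12].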

Review comment (collaborator):

I haven't been involved in defining these intrinsics internally, but I have thought about how they might work before. I'm not sure whether it is better to have a generic partial reduction like this or something more specific to dotprod that includes the zext/sext and mul. Both have advantages and disadvantages: the more instructions there are, the harder they are to cost-model well, but more can be done with them.

It would seem, though, that we should be defining how these are expected to reduce the inputs into the output lanes. Otherwise the definition is a bit wishy-washy in a way that can make them more difficult to use than is necessary. I would expect them to perform pair-wise reductions, and it might be simpler if they were limited to power-of-2 factors so that they can deinterleave in steps.
https://godbolt.org/z/G737aj1n6

The codegen that currently exists doesn't seem to do that, though.
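
A sketch of the pair-wise, deinterleave-in-steps scheme described above (a hypothetical expansion; it is not what this patch currently emits):

; One pair-wise step: deinterleave <8 x i32> into even and odd elements and
; add them, halving the width; repeating the step handles larger factors.
%even = shufflevector <8 x i32> %in, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%odd  = shufflevector <8 x i32> %in, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%r    = add <4 x i32> %even, %odd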

Review comment (collaborator):

The intent here is to keep the reduction intrinsics as loose as possible so we don't lock the code generator into a specific ordering.

If there's an option to simply extend the original intrinsics, that would be super, but I figured it would be easier to move current uses to a newer intrinsic (assuming it leaves the experimental space) than the other way round.


Arguments:
""""""""""

The argument to this intrinsic must be a vector of integer values.


'``llvm.experimental.vector.histogram.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


llvm/include/llvm/IR/Intrinsics.td (6 additions, 0 deletions)

@@ -2635,6 +2635,12 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
[llvm_anyvector_ty],
[IntrNoMem]>;

//===-------------- Intrinsics to perform partial reduction ---------------===//

def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
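
Since both the result type and the operand type are overloaded (``llvm_anyvector_ty``), both appear as suffixes in an instance's mangled name, which is why the LangRef declarations above carry two vector-type suffixes, e.g.:

declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in)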

//===----------------- Pointer Authentication Intrinsics ------------------===//
//


llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (21 additions, 0 deletions)

@@ -7914,6 +7914,27 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, Trunc);
return;
}
case Intrinsic::experimental_vector_partial_reduce_add: {

Review comment (collaborator):

I think we can pass this through as an INTRINSIC_WO_CHAIN node, at least for targets that support it.

Review comment (collaborator):

We need to be careful, because I don't think common code exists to type-legalise arbitrary INTRINSIC_WO_CHAIN calls (given their nature). Presumably we'll just follow the precedent set for get.active.lane.mask and cttz.elts when we add AArch64-specific lowering.

I can't help but think that at some point we'll just want to drop the "same element type" restriction of VECREDUCE_ADD and have explicit signed and unsigned versions, like we have for ABDS/ABDU, but I guess we can see how things work out (again, much as we are doing for the intrinsics mentioned before).

auto DL = getCurSDLoc();

Review comment (huntergr-arm, Jul 2, 2024):

nit: It would be good to remove the 'auto' declarations and use the appropriate named types (SDValue, EVT, int, etc.). I think you should already have a variable in scope for getCurSDLoc() as well (sdl, from the start of the function).

auto ReducedTy = EVT::getEVT(I.getType());
auto OpNode = getValue(I.getOperand(0));
auto Index = DAG.getVectorIdxConstant(0, DL);
auto FullTy = OpNode.getValueType();

auto ResultVector = DAG.getSplat(ReducedTy, DL, DAG.getConstant(0, DL, ReducedTy.getScalarType()));
unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements();

for(unsigned i = 0; i < ScaleFactor; i++) {

Review comment (huntergr-arm, Jun 12, 2024):

I'm now a bit concerned about the semantics of the intrinsic. In one of the test cases below (partial_reduce_add), you have the same size of vector for both inputs. Applying this lowering results in the second vector being reduced and the result added to the first lane of the accumulator, with the other lanes left untouched.

I think the idea was to reduce the second input vector until it matches the size of the first, then perform a vector add of the two. If both are the same size to begin with, you just need a single vector add. @paulwalker-arm can you please clarify?

The langref text will need to make the exact semantics clear.

Review comment (collaborator):

I think the previous design may have been better, since it was clearly just performing the reduction of a single vector value into another (and possibly to a scalar, as @arsenm suggests). Making it a binop as well seems to make it less flexible vs. just having a separate binop afterwards. Maybe I'm missing something, though...

Review comment (collaborator):

The problem with the "having a separate binop" approach is that it constrains optimisation/code generation, because that binop requires a very specific ordering for how elements are combined, which is the very problem the partial reduction is solving.

I think folk are stuck in a "how can we use dot instructions" mindset, whilst I'm trying to push for "what is the loosest way reductions can be represented in IR". To this point, the currently suggested langref text for the intrinsic is still too strict, because it gives the impression there's a defined order for how the second operand's elements are combined with the first, where there shouldn't be.

Review comment (collaborator):

@huntergr-arm - Yes, the intent for "same size operands" is to emit a stock binop. This will effectively match what LoopVectorize does today and thus allow the intrinsic to be used regardless of the target, rather than having to implement target-specific/controlled paths within the vectorizer.

Review comment (collaborator):

Ok, I was thrown off by the langref description. I guess then I'd like to see the default lowering changed to just extract the subvectors from the second operand and perform a vector add onto the first operand, instead of reducing the subvectors and adding the results to individual lanes. It technically meets the defined semantics (a target-defined order of reduction operations), but the current codegen is pretty awful compared to a series of vector adds.
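
Expressed as IR, the lowering requested above would look roughly like this for a <16 x i32> to <4 x i32> reduction (a hypothetical expansion using the generic llvm.vector.extract intrinsic; any grouping of the adds is permitted by the semantics):

; Split the wide input into result-sized pieces and combine them with
; whole-vector adds instead of per-lane scalar reductions.
define <4 x i32> @expand_partial_reduce(<16 x i32> %in) {
  %p0 = call <4 x i32> @llvm.vector.extract.v4i32.v16i32(<16 x i32> %in, i64 0)
  %p1 = call <4 x i32> @llvm.vector.extract.v4i32.v16i32(<16 x i32> %in, i64 4)
  %p2 = call <4 x i32> @llvm.vector.extract.v4i32.v16i32(<16 x i32> %in, i64 8)
  %p3 = call <4 x i32> @llvm.vector.extract.v4i32.v16i32(<16 x i32> %in, i64 12)
  %s0 = add <4 x i32> %p0, %p1
  %s1 = add <4 x i32> %p2, %p3
  %r  = add <4 x i32> %s0, %s1
  ret <4 x i32> %r
}
declare <4 x i32> @llvm.vector.extract.v4i32.v16i32(<16 x i32>, i64)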

auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL);
auto TargetIndex = DAG.getVectorIdxConstant(i, DL);
auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex});

Review comment (collaborator):

This seems to assume that each subvector will be the same size as the smaller vector type? It works for the case we're interested in (e.g. <vscale x 16 x i32> to <vscale x 4 x i32>), but it would fail if the larger type were <vscale x 8 x i32> -- you'd want to extract <vscale x 2 x i32> and reduce that. (We might never create such a partial reduction, but I think it should work correctly if we did.)

N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N);
ResultVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex});
}

setValue(&I, ResultVector);
return;
}
case Intrinsic::experimental_cttz_elts: {
auto DL = getCurSDLoc();
SDValue Op = getValue(I.getOperand(0));

llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll (76 additions, 0 deletions)

@@ -0,0 +1,76 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -force-vector-interleave=1 %s | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"

define void @partial_reduce_add(<vscale x 16 x i8> %wide.load.pre, <vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %index) #0 {
; CHECK-LABEL: partial_reduce_add:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: index z2.s, #0, #1
; CHECK-NEXT: mov z4.s, w8
; CHECK-NEXT: mov w8, #2 // =0x2
; CHECK-NEXT: ptrue p2.s, vl1
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: mov z6.s, w8
; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z4.s
; CHECK-NEXT: uaddv d3, p0, z0.s
; CHECK-NEXT: mov z0.s, #0 // =0x0
; CHECK-NEXT: uaddv d7, p0, z1.s
; CHECK-NEXT: uaddv d4, p0, z5.s
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: fmov x8, d3
; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT: mov z1.s, p2/m, w8
; CHECK-NEXT: mov w8, #3 // =0x3
; CHECK-NEXT: cmpeq p2.s, p0/z, z2.s, z6.s
; CHECK-NEXT: mov z5.s, w8
; CHECK-NEXT: fmov x8, d7
; CHECK-NEXT: uaddv d3, p0, z3.s
; CHECK-NEXT: mov z1.s, p1/m, w8
; CHECK-NEXT: fmov x8, d4
; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z5.s
; CHECK-NEXT: mov z1.s, p2/m, w8
; CHECK-NEXT: fmov x8, d3
; CHECK-NEXT: mov z1.s, p0/m, w8
; CHECK-NEXT: addvl x8, x1, #1
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: orr z0.d, z1.d, z0.d
; CHECK-NEXT: cbnz x8, .LBB0_1
; CHECK-NEXT: // %bb.2: // %middle.block
; CHECK-NEXT: ret
entry:
%2 = call i64 @llvm.vscale.i64()
%3 = mul i64 %2, 16
br label %vector.body

vector.body: ; preds = %vector.body, %entry

Review comment (collaborator):

It doesn't need a loop for the test. (A loop-free sketch follows the function below.)

%vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %4, %vector.body ]
%partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %1)
%4 = or <vscale x 4 x i32> %partial.reduce, %vec.phi
%index.next = add i64 %index, %3
%5 = icmp eq i64 %index.next, 0
br i1 %5, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%6 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %4)
ret void
}
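
A loop-free form of this test, in the direction the review comment above suggests (a hypothetical simplification, not part of the patch):

; The partial reduction can be exercised directly, without the phi/or loop.
define <vscale x 4 x i32> @partial_reduce_add_noloop(<vscale x 16 x i32> %in) #0 {
  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %in)
  ret <vscale x 4 x i32> %partial.reduce
}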

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare i64 @llvm.vscale.i64() #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) #2

attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }