-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[RISCV] Lower shuffle which splats a single span (without exact VLEN) #127108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
If we have a shuffle which repeats the same pattern of elements, all of which come from the first register in the source register group, we can lower this to a single vrgather at m1 to perform the element rearrangement, and reuse that for each register in the result vector register group.
@llvm/pr-subscribers-backend-risc-v Author: Philip Reames (preames) — Changes: If we have a shuffle which repeats the same pattern of elements, all of which come from the first register in the source register group, we can lower this to a single vrgather at m1 to perform the element rearrangement, and reuse that for each register in the result vector register group. Full diff: https://github.com/llvm/llvm-project/pull/127108.diff 2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 829eef2e4d9d9..1156fd2e67fed 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5360,6 +5360,23 @@ static bool isLowSourceShuffle(ArrayRef<int> Mask, int Span) {
[&](const auto &Idx) { return Idx == -1 || Idx < Span; });
}
+/// Return true for a mask which performs an arbitrary shuffle within the first
+/// span, and then repeats that same result across all remaining spans. Note
+/// that this doesn't check if all the inputs come from a single span!
+static bool isSpanSplatShuffle(ArrayRef<int> Mask, int Span) {
+ SmallVector<int> LowSpan(Span, -1);
+ for (auto [I, M] : enumerate(Mask)) {
+ if (M == -1)
+ continue;
+ int SpanIdx = I % Span;
+ if (LowSpan[SpanIdx] == -1)
+ LowSpan[SpanIdx] = M;
+ if (LowSpan[SpanIdx] != M)
+ return false;
+ }
+ return true;
+}
+
/// Try to widen element type to get a new mask value for a better permutation
/// sequence. This doesn't try to inspect the widened mask for profitability;
/// we speculate the widened form is equal or better. This has the effect of
@@ -5775,6 +5792,37 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
SubVec, SubIdx);
}
+ } else if (NumElts > MinVLMAX && isLowSourceShuffle(Mask, MinVLMAX) &&
+ isSpanSplatShuffle(Mask, MinVLMAX)) {
+ // If we have a shuffle which only uses the first register in our source
+ // register group, and repeats the same index across all spans, we can
+ // use a single vrgather (and possibly some register moves).
+ // TODO: This can be generalized for m2 or m4, or for any shuffle for
+ // which we can do a linear number of shuffles to form an m1 which
+ // contains all the output elements.
+ const MVT M1VT = getLMUL1VT(ContainerVT);
+ EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType());
+ auto [InnerTrueMask, InnerVL] =
+ getDefaultScalableVLOps(M1VT, DL, DAG, Subtarget);
+ int N = ContainerVT.getVectorMinNumElements() /
+ M1VT.getVectorMinNumElements();
+ assert(isPowerOf2_32(N) && N <= 8);
+ SDValue SubV1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue SubIndex =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue SubVec =
+ DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
+ DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
+ Gather = DAG.getUNDEF(ContainerVT);
+ for (int i = 0; i < N; i++) {
+ SDValue SubIdx =
+ DAG.getVectorIdxConstant(M1VT.getVectorMinNumElements() * i, DL);
+ Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
+ SubVec, SubIdx);
+ }
} else if (NumElts > MinVLMAX && isLowSourceShuffle(Mask, MinVLMAX)) {
// If we have a shuffle which only uses the first register in our
// source register group, we can do a linear number of m1 vrgathers
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index d7120b4a16938..3e31c9de61657 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1311,22 +1311,14 @@ define void @shuffle_i128_splat(ptr %p) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: lui a2, 16
-; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: lui a1, 16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v10, v9, a1
-; CHECK-NEXT: vslidedown.vx v11, v10, a1
-; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v13, v8, v10
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v9
-; CHECK-NEXT: vrgatherei16.vv v14, v8, v11
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v9, v11, a1
+; CHECK-NEXT: vmv.v.x v9, a1
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v15, v8, v9
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v9
+; CHECK-NEXT: vmv.v.v v13, v12
+; CHECK-NEXT: vmv.v.v v14, v12
+; CHECK-NEXT: vmv.v.v v15, v12
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vse64.v v12, (a0)
; CHECK-NEXT: ret
@@ -1435,3 +1427,20 @@ define <4 x i16> @vmerge_3(<4 x i16> %x) {
%s = shufflevector <4 x i16> %x, <4 x i16> <i16 poison, i16 5, i16 poison, i16 poison>, <4 x i32> <i32 0, i32 5, i32 5, i32 3>
ret <4 x i16> %s
}
+
+
+define <8 x i64> @shuffle_v8i164_span_splat(<8 x i64> %a) nounwind {
+; CHECK-LABEL: shuffle_v8i164_span_splat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 1
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v9
+; CHECK-NEXT: vmv.v.v v13, v12
+; CHECK-NEXT: vmv.v.v v14, v12
+; CHECK-NEXT: vmv.v.v v15, v12
+; CHECK-NEXT: vmv4r.v v8, v12
+; CHECK-NEXT: ret
+ %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> <i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0>
+ ret <8 x i64> %res
+}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…llvm#127108) If we have a shuffle which repeats the same pattern of elements, all of which come from the first register in the source register group, we can lower this to a single vrgather at m1 to perform the element rearrangement, and reuse that for each register in the result vector register group.
If we have a shuffle which repeats the same pattern of elements, all of which come from the first register in the source register group, we can lower this to a single vrgather at m1 to perform the element rearrangement, and reuse that for each register in the result vector register group.