Skip to content

[RISCV] Merge shuffle sources if lanes are disjoint #119401

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5197,6 +5197,67 @@ static bool isCompressMask(ArrayRef<int> Mask) {
return true;
}

/// Given a shuffle where the indices are disjoint between the two sources,
/// e.g.:
///
/// t2:v4i8 = vector_shuffle t0:v4i8, t1:v4i8, <2, 7, 1, 4>
///
/// Merge the two sources into one and do a single source shuffle:
///
/// t2:v4i8 = vselect t1:v4i8, t0:v4i8, <0, 1, 0, 1>
/// t3:v4i8 = vector_shuffle t2:v4i8, undef, <2, 3, 1, 0>
///
/// A vselect will either be merged into a masked instruction or be lowered as a
/// vmerge.vvm, which is cheaper than a vrgather.vv.
///
/// \returns the merged single-source shuffle, or SDValue() when some result
/// lane is claimed by both sources (i.e. the mask is not disjoint).
static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
MVT VT = SVN->getSimpleValueType(0);
MVT XLenVT = Subtarget.getXLenVT();
SDLoc DL(SVN);

const ArrayRef<int> Mask = SVN->getMask();

// Work out which source each lane will come from.
SmallVector<int, 16> Srcs(Mask.size(), -1);

for (int Idx : Mask) {
if (Idx == -1)
continue;
// SrcIdx is the lane the element occupies within its own source vector;
// indices below Mask.size() read source 0, the rest read source 1.
unsigned SrcIdx = Idx % Mask.size();
int Src = (uint32_t)Idx < Mask.size() ? 0 : 1;
if (Srcs[SrcIdx] == -1)
// Mark this source as using this lane.
Srcs[SrcIdx] = Src;
else if (Srcs[SrcIdx] != Src)
// The other source is using this lane: not disjoint.
return SDValue();
}

// Build the i1 vselect mask: lanes claimed by source 0 get a 1, since a true
// mask bit makes ISD::VSELECT take the corresponding lane of operand 0.
SmallVector<SDValue> SelectMaskVals;
for (int Lane : Srcs) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you reverse the select here so that it uses the same order as the generic fallthrough below? From prior experience, our vmerge.vxm matching is oddly fragile. I'd like to remove this unrelated change if possible.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I've reversed it to match the generic fallthrough when SwapOps is false. But it's worth pointing out that this simultaneously removes some mask diffs and introduces some others too

if (Lane == -1)
SelectMaskVals.push_back(DAG.getUNDEF(XLenVT));
else
SelectMaskVals.push_back(DAG.getConstant(Lane ? 0 : 1, DL, XLenVT));
}
MVT MaskVT = VT.changeVectorElementType(MVT::i1);
SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, SelectMaskVals);
SDValue Select = DAG.getNode(ISD::VSELECT, DL, VT, SelectMask,
SVN->getOperand(0), SVN->getOperand(1));

// Move all indices relative to the first source.
// After the vselect, each needed element already sits in its home lane of
// the merged vector, so reducing every index modulo the element count turns
// the original two-source mask into a single-source one.
SmallVector<int> NewMask(Mask.size());
for (unsigned I = 0; I < Mask.size(); I++) {
if (Mask[I] == -1)
NewMask[I] = -1;
else
NewMask[I] = Mask[I] % Mask.size();
}

return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
}

static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
SDValue V1 = Op.getOperand(0);
Expand Down Expand Up @@ -5540,6 +5601,17 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts));
}

// If the mask indices are disjoint between the two sources, we can lower it
// as a vselect + a single source vrgather.vv. Don't do this if we think the
// operands may end up being lowered to something cheaper than a vrgather.vv.
if (!DAG.isSplatValue(V2) && !DAG.isSplatValue(V1) &&
!ShuffleVectorSDNode::isSplatMask(ShuffleMaskLHS.data(), VT) &&
!ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS.data(), VT) &&
!ShuffleVectorInst::isIdentityMask(ShuffleMaskLHS, NumElts) &&
!ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts))
if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget))
return V;

// Try to pick a profitable operand order.
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts);
Expand Down
73 changes: 73 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
Original file line number Diff line number Diff line change
Expand Up @@ -395,3 +395,76 @@ define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) {
%s = shufflevector <4 x half> %v, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x half> %s
}

; Every result lane is claimed by exactly one of the two sources, so this is
; expected to lower as a vmerge of the sources plus one single-source gather
; rather than two gathers.
define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI30_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle8.v v16, (a0)
; CHECK-NEXT: lui a0, 11
; CHECK-NEXT: addi a0, a0, -1366
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v18, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
; CHECK-NEXT: ret
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
ret <16 x float> %out
}

; The first source's half of the mask is an identity (indices 0..3 in place),
; so the disjoint-lanes merge is skipped in favor of a single masked gather.
define <16 x float> @shuffle_disjoint_lanes_one_identity(<16 x float> %v, <16 x float> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI31_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: li a0, -272
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
ret <16 x float> %out
}

define <16 x float> @shuffle_disjoint_lanes_one_broadcast(<16 x float> %v, <16 x float> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI32_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI32_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v20, (a0)
; CHECK-NEXT: lui a0, 15
; CHECK-NEXT: addi a0, a0, 240
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vrgather.vi v16, v8, 7
; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 26, i32 30, i32 22, i32 18, i32 7, i32 7, i32 7, i32 7, i32 24, i32 28, i32 20, i32 16>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test isn't exactly convincing that we want to avoid the splat case. I think maybe this hints at the difference between a splatvalue (from scalar) and the splat shuffle (from mask). The later requires a vrgather.vi, whereas the former is a vmerge.vxm. Maybe rework the tests to cover both cases separately?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's a diff from a regression if we remove the isSplatValue guard:

 ; CHECK-LABEL: shuffle_vx_v4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 6
-; CHECK-NEXT:    vmerge.vim v8, v8, 5, v0
+; CHECK-NEXT:    vmv.v.i v0, 9
+; CHECK-NEXT:    vmv.v.i v9, 5
+; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> %x, <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
   ret <4 x i16> %s

And here's one if we remove the isSplatMask guard:

--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -364,12 +364,14 @@ define <8 x i8> @splat_ve4_ins_i1ve3(<8 x i8> %v) {
 define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    li a0, 66
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vmerge.vvm v9, v9, v8, v0
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vi v10, v8, 2
-; CHECK-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vmv.v.i v8, 2
+; CHECK-NEXT:    vmerge.vim v10, v8, 0, v0
+; CHECK-NEXT:    vrgather.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 2, i32 2, i32 2, i32 2, i32 8, i32 2>
   ret <8 x i8> %shuff

I think ideally we would really only want to do this lowering when we know we're going to end up with at least a vrgather.vv. Maybe in a follow up this could be reworked as a late combine on VRGATHER_VV_VL instead? It might be a bit trickier because you would need to reconstruct the original mask.

ret <16 x float> %out
}

; The first operand is a splat of a scalar (vfmv.v.f is cheaper than a
; gather), so the disjoint-lanes merge is skipped.
define <16 x float> @shuffle_disjoint_lanes_one_splat(float %v, <16 x float> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI33_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: lui a0, 15
; CHECK-NEXT: addi a0, a0, 240
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vfmv.v.f v12, fa0
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%head = insertelement <16 x float> poison, float %v, i32 0
%splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
%out = shufflevector <16 x float> %splat, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
ret <16 x float> %out
}
100 changes: 83 additions & 17 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
Original file line number Diff line number Diff line change
Expand Up @@ -451,21 +451,14 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: splat_ve2_we0_ins_i2ve4_i5we6:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI26_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.v.i v10, 6
; CHECK-NEXT: vmv.v.i v11, 0
; CHECK-NEXT: lui a0, 8256
; CHECK-NEXT: addi a0, a0, 2
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v12, a0
; CHECK-NEXT: li a0, 98
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v11, v10, 5
; CHECK-NEXT: vle8.v v10, (a0)
; CHECK-NEXT: li a0, 20
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
; CHECK-NEXT: vrgather.vv v8, v9, v10
; CHECK-NEXT: ret
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 2, i32 2, i32 14, i32 8, i32 2>
ret <8 x i8> %shuff
Expand Down Expand Up @@ -693,12 +686,12 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI46_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v10, (a0)
; CHECK-NEXT: li a0, -22
; CHECK-NEXT: li a0, 84
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
; CHECK-NEXT: vrgather.vv v8, v9, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
ret <8 x i8> %res
Expand Down Expand Up @@ -1073,3 +1066,76 @@ define <16 x i64> @shuffle_zipodd_v16i64(<16 x i64> %v1, <16 x i64> %v2) {
%out = shufflevector <16 x i64> %v1, <16 x i64> %v2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
ret <16 x i64> %out
}

; Integer variant: every result lane is claimed by exactly one source, so the
; shuffle lowers as a vmerge of the sources plus one single-source gather.
define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI74_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI74_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle8.v v16, (a0)
; CHECK-NEXT: lui a0, 11
; CHECK-NEXT: addi a0, a0, -1366
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v18, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
; CHECK-NEXT: ret
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
ret <16 x i32> %out
}

; The first source's half of the mask is an identity (indices 0..3 in place),
; so the disjoint-lanes merge is skipped in favor of a single masked gather.
define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI75_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI75_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: li a0, -272
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
ret <16 x i32> %out
}

; The first source's indices are a splat mask (lane 7 broadcast), which lowers
; to a vrgather.vi, so the disjoint-lanes merge is skipped.
define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI76_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI76_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v20, (a0)
; CHECK-NEXT: lui a0, 15
; CHECK-NEXT: addi a0, a0, 240
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vrgather.vi v16, v8, 7
; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 26, i32 30, i32 22, i32 18, i32 7, i32 7, i32 7, i32 7, i32 24, i32 28, i32 20, i32 16>
ret <16 x i32> %out
}

; The first operand is a splat of a scalar (vmv.v.x is cheaper than a gather),
; so the disjoint-lanes merge is skipped.
define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI77_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI77_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v16, (a1)
; CHECK-NEXT: lui a1, 15
; CHECK-NEXT: addi a1, a1, 240
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vmv.v.x v12, a0
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%head = insertelement <16 x i32> poison, i32 %v, i32 0
%splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer
%out = shufflevector <16 x i32> %splat, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
ret <16 x i32> %out
}
Loading
Loading