Skip to content

Commit 088db86

Browse files
authored
[RISCV] Merge shuffle sources if lanes are disjoint (#119401)
In x264, there are a few kernels with shuffles like this: %41 = add nsw <16 x i32> %39, %40 %42 = sub nsw <16 x i32> %39, %40 %43 = shufflevector <16 x i32> %41, <16 x i32> %42, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16> Because this is a complex two-source shuffle, this will get lowered as two vrgather.vvs that are blended together. vadd.vv v20, v16, v12 vsub.vv v12, v16, v12 vrgatherei16.vv v24, v20, v10 vrgatherei16.vv v24, v12, v16, v0.t However, the indices coming from each source are disjoint, so we can blend the two together and perform a single source shuffle instead: %41 = add nsw <16 x i32> %39, %40 %42 = sub nsw <16 x i32> %39, %40 %43 = select <0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1> %41, %42 %44 = shufflevector <16 x i32> %43, <16 x i32> poison, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 10, i32 14, i32 6, i32 2, i32 9, i32 13, i32 5, i32 1, i32 8, i32 12, i32 4, i32 0> The select will likely get merged into the preceding instruction, and then we only have to do one vrgather.vv: vadd.vv v20, v16, v12 vsub.vv v20, v16, v12, v0.t vrgatherei16.vv v24, v20, v10 This patch bails if either of the sources is a broadcast/splat/identity shuffle, since that will usually already have some sort of cheaper lowering. This improves performance on 525.x264_r by 4.12% with -O3 -flto -march=rva22u64_v on the spacemit-x60.
1 parent b26fe5b commit 088db86

File tree

5 files changed

+949
-687
lines changed

5 files changed

+949
-687
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

+72
Original file line numberDiff line numberDiff line change
@@ -5197,6 +5197,67 @@ static bool isCompressMask(ArrayRef<int> Mask) {
51975197
return true;
51985198
}
51995199

5200+
/// Given a shuffle where the indices are disjoint between the two sources,
5201+
/// e.g.:
5202+
///
5203+
/// t2:v4i8 = vector_shuffle t0:v4i8, t1:v4i8, <2, 7, 1, 4>
5204+
///
5205+
/// Merge the two sources into one and do a single source shuffle:
5206+
///
5207+
/// t2:v4i8 = vselect t1:v4i8, t0:v4i8, <0, 1, 0, 1>
5208+
/// t3:v4i8 = vector_shuffle t2:v4i8, undef, <2, 3, 1, 0>
5209+
///
5210+
/// A vselect will either be merged into a masked instruction or be lowered as a
5211+
/// vmerge.vvm, which is cheaper than a vrgather.vv.
5212+
static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
5213+
SelectionDAG &DAG,
5214+
const RISCVSubtarget &Subtarget) {
5215+
MVT VT = SVN->getSimpleValueType(0);
5216+
MVT XLenVT = Subtarget.getXLenVT();
5217+
SDLoc DL(SVN);
5218+
5219+
const ArrayRef<int> Mask = SVN->getMask();
5220+
5221+
// Work out which source each lane will come from.
5222+
SmallVector<int, 16> Srcs(Mask.size(), -1);
5223+
5224+
for (int Idx : Mask) {
5225+
if (Idx == -1)
5226+
continue;
5227+
unsigned SrcIdx = Idx % Mask.size();
5228+
int Src = (uint32_t)Idx < Mask.size() ? 0 : 1;
5229+
if (Srcs[SrcIdx] == -1)
5230+
// Mark this source as using this lane.
5231+
Srcs[SrcIdx] = Src;
5232+
else if (Srcs[SrcIdx] != Src)
5233+
// The other source is using this lane: not disjoint.
5234+
return SDValue();
5235+
}
5236+
5237+
SmallVector<SDValue> SelectMaskVals;
5238+
for (int Lane : Srcs) {
5239+
if (Lane == -1)
5240+
SelectMaskVals.push_back(DAG.getUNDEF(XLenVT));
5241+
else
5242+
SelectMaskVals.push_back(DAG.getConstant(Lane ? 0 : 1, DL, XLenVT));
5243+
}
5244+
MVT MaskVT = VT.changeVectorElementType(MVT::i1);
5245+
SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, SelectMaskVals);
5246+
SDValue Select = DAG.getNode(ISD::VSELECT, DL, VT, SelectMask,
5247+
SVN->getOperand(0), SVN->getOperand(1));
5248+
5249+
// Move all indices relative to the first source.
5250+
SmallVector<int> NewMask(Mask.size());
5251+
for (unsigned I = 0; I < Mask.size(); I++) {
5252+
if (Mask[I] == -1)
5253+
NewMask[I] = -1;
5254+
else
5255+
NewMask[I] = Mask[I] % Mask.size();
5256+
}
5257+
5258+
return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
5259+
}
5260+
52005261
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
52015262
const RISCVSubtarget &Subtarget) {
52025263
SDValue V1 = Op.getOperand(0);
@@ -5540,6 +5601,17 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
55405601
ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts));
55415602
}
55425603

5604+
// If the mask indices are disjoint between the two sources, we can lower it
5605+
// as a vselect + a single source vrgather.vv. Don't do this if we think the
5606+
// operands may end up being lowered to something cheaper than a vrgather.vv.
5607+
if (!DAG.isSplatValue(V2) && !DAG.isSplatValue(V1) &&
5608+
!ShuffleVectorSDNode::isSplatMask(ShuffleMaskLHS.data(), VT) &&
5609+
!ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS.data(), VT) &&
5610+
!ShuffleVectorInst::isIdentityMask(ShuffleMaskLHS, NumElts) &&
5611+
!ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts))
5612+
if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget))
5613+
return V;
5614+
55435615
// Try to pick a profitable operand order.
55445616
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
55455617
SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts);

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll

+73
Original file line numberDiff line numberDiff line change
@@ -395,3 +395,76 @@ define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) {
395395
%s = shufflevector <4 x half> %v, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
396396
ret <4 x half> %s
397397
}
398+
399+
define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
400+
; CHECK-LABEL: shuffle_disjoint_lanes:
401+
; CHECK: # %bb.0:
402+
; CHECK-NEXT: lui a0, %hi(.LCPI30_0)
403+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0)
404+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
405+
; CHECK-NEXT: vle8.v v16, (a0)
406+
; CHECK-NEXT: lui a0, 11
407+
; CHECK-NEXT: addi a0, a0, -1366
408+
; CHECK-NEXT: vmv.s.x v0, a0
409+
; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
410+
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
411+
; CHECK-NEXT: vsext.vf2 v18, v16
412+
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
413+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
414+
; CHECK-NEXT: ret
415+
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
416+
ret <16 x float> %out
417+
}
418+
419+
define <16 x float> @shuffle_disjoint_lanes_one_identity(<16 x float> %v, <16 x float> %w) {
420+
; CHECK-LABEL: shuffle_disjoint_lanes_one_identity:
421+
; CHECK: # %bb.0:
422+
; CHECK-NEXT: lui a0, %hi(.LCPI31_0)
423+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0)
424+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
425+
; CHECK-NEXT: vle16.v v16, (a0)
426+
; CHECK-NEXT: li a0, -272
427+
; CHECK-NEXT: vmv.s.x v0, a0
428+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
429+
; CHECK-NEXT: ret
430+
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
431+
ret <16 x float> %out
432+
}
433+
434+
define <16 x float> @shuffle_disjoint_lanes_one_broadcast(<16 x float> %v, <16 x float> %w) {
435+
; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast:
436+
; CHECK: # %bb.0:
437+
; CHECK-NEXT: lui a0, %hi(.LCPI32_0)
438+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI32_0)
439+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
440+
; CHECK-NEXT: vle16.v v20, (a0)
441+
; CHECK-NEXT: lui a0, 15
442+
; CHECK-NEXT: addi a0, a0, 240
443+
; CHECK-NEXT: vmv.s.x v0, a0
444+
; CHECK-NEXT: vrgather.vi v16, v8, 7
445+
; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t
446+
; CHECK-NEXT: vmv.v.v v8, v16
447+
; CHECK-NEXT: ret
448+
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 26, i32 30, i32 22, i32 18, i32 7, i32 7, i32 7, i32 7, i32 24, i32 28, i32 20, i32 16>
449+
ret <16 x float> %out
450+
}
451+
452+
define <16 x float> @shuffle_disjoint_lanes_one_splat(float %v, <16 x float> %w) {
453+
; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
454+
; CHECK: # %bb.0:
455+
; CHECK-NEXT: lui a0, %hi(.LCPI33_0)
456+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0)
457+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
458+
; CHECK-NEXT: vle16.v v16, (a0)
459+
; CHECK-NEXT: lui a0, 15
460+
; CHECK-NEXT: addi a0, a0, 240
461+
; CHECK-NEXT: vmv.s.x v0, a0
462+
; CHECK-NEXT: vfmv.v.f v12, fa0
463+
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t
464+
; CHECK-NEXT: vmv.v.v v8, v12
465+
; CHECK-NEXT: ret
466+
%head = insertelement <16 x float> poison, float %v, i32 0
467+
%splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
468+
%out = shufflevector <16 x float> %splat, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
469+
ret <16 x float> %out
470+
}

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

+83-17
Original file line numberDiff line numberDiff line change
@@ -451,21 +451,14 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
451451
define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
452452
; CHECK-LABEL: splat_ve2_we0_ins_i2ve4_i5we6:
453453
; CHECK: # %bb.0:
454+
; CHECK-NEXT: lui a0, %hi(.LCPI26_0)
455+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0)
454456
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
455-
; CHECK-NEXT: vmv.v.i v10, 6
456-
; CHECK-NEXT: vmv.v.i v11, 0
457-
; CHECK-NEXT: lui a0, 8256
458-
; CHECK-NEXT: addi a0, a0, 2
459-
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
460-
; CHECK-NEXT: vmv.v.x v12, a0
461-
; CHECK-NEXT: li a0, 98
462-
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
463-
; CHECK-NEXT: vslideup.vi v11, v10, 5
457+
; CHECK-NEXT: vle8.v v10, (a0)
458+
; CHECK-NEXT: li a0, 20
464459
; CHECK-NEXT: vmv.s.x v0, a0
465-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
466-
; CHECK-NEXT: vrgather.vv v10, v8, v12
467-
; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t
468-
; CHECK-NEXT: vmv1r.v v8, v10
460+
; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
461+
; CHECK-NEXT: vrgather.vv v8, v9, v10
469462
; CHECK-NEXT: ret
470463
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 2, i32 2, i32 14, i32 8, i32 2>
471464
ret <8 x i8> %shuff
@@ -693,12 +686,12 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
693686
; CHECK: # %bb.0:
694687
; CHECK-NEXT: lui a0, %hi(.LCPI46_0)
695688
; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0)
696-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
689+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
697690
; CHECK-NEXT: vle8.v v10, (a0)
698-
; CHECK-NEXT: li a0, -22
691+
; CHECK-NEXT: li a0, 84
699692
; CHECK-NEXT: vmv.s.x v0, a0
700-
; CHECK-NEXT: vslidedown.vi v8, v8, 2
701-
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
693+
; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
694+
; CHECK-NEXT: vrgather.vv v8, v9, v10
702695
; CHECK-NEXT: ret
703696
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
704697
ret <8 x i8> %res
@@ -1073,3 +1066,76 @@ define <16 x i64> @shuffle_zipodd_v16i64(<16 x i64> %v1, <16 x i64> %v2) {
10731066
%out = shufflevector <16 x i64> %v1, <16 x i64> %v2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
10741067
ret <16 x i64> %out
10751068
}
1069+
1070+
define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
1071+
; CHECK-LABEL: shuffle_disjoint_lanes:
1072+
; CHECK: # %bb.0:
1073+
; CHECK-NEXT: lui a0, %hi(.LCPI74_0)
1074+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI74_0)
1075+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1076+
; CHECK-NEXT: vle8.v v16, (a0)
1077+
; CHECK-NEXT: lui a0, 11
1078+
; CHECK-NEXT: addi a0, a0, -1366
1079+
; CHECK-NEXT: vmv.s.x v0, a0
1080+
; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
1081+
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
1082+
; CHECK-NEXT: vsext.vf2 v18, v16
1083+
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1084+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
1085+
; CHECK-NEXT: ret
1086+
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
1087+
ret <16 x i32> %out
1088+
}
1089+
1090+
define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32> %w) {
1091+
; CHECK-LABEL: shuffle_disjoint_lanes_one_identity:
1092+
; CHECK: # %bb.0:
1093+
; CHECK-NEXT: lui a0, %hi(.LCPI75_0)
1094+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI75_0)
1095+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
1096+
; CHECK-NEXT: vle16.v v16, (a0)
1097+
; CHECK-NEXT: li a0, -272
1098+
; CHECK-NEXT: vmv.s.x v0, a0
1099+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
1100+
; CHECK-NEXT: ret
1101+
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
1102+
ret <16 x i32> %out
1103+
}
1104+
1105+
define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32> %w) {
1106+
; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast:
1107+
; CHECK: # %bb.0:
1108+
; CHECK-NEXT: lui a0, %hi(.LCPI76_0)
1109+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI76_0)
1110+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
1111+
; CHECK-NEXT: vle16.v v20, (a0)
1112+
; CHECK-NEXT: lui a0, 15
1113+
; CHECK-NEXT: addi a0, a0, 240
1114+
; CHECK-NEXT: vmv.s.x v0, a0
1115+
; CHECK-NEXT: vrgather.vi v16, v8, 7
1116+
; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t
1117+
; CHECK-NEXT: vmv.v.v v8, v16
1118+
; CHECK-NEXT: ret
1119+
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 26, i32 30, i32 22, i32 18, i32 7, i32 7, i32 7, i32 7, i32 24, i32 28, i32 20, i32 16>
1120+
ret <16 x i32> %out
1121+
}
1122+
1123+
define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) {
1124+
; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
1125+
; CHECK: # %bb.0:
1126+
; CHECK-NEXT: lui a1, %hi(.LCPI77_0)
1127+
; CHECK-NEXT: addi a1, a1, %lo(.LCPI77_0)
1128+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
1129+
; CHECK-NEXT: vle16.v v16, (a1)
1130+
; CHECK-NEXT: lui a1, 15
1131+
; CHECK-NEXT: addi a1, a1, 240
1132+
; CHECK-NEXT: vmv.s.x v0, a1
1133+
; CHECK-NEXT: vmv.v.x v12, a0
1134+
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t
1135+
; CHECK-NEXT: vmv.v.v v8, v12
1136+
; CHECK-NEXT: ret
1137+
%head = insertelement <16 x i32> poison, i32 %v, i32 0
1138+
%splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer
1139+
%out = shufflevector <16 x i32> %splat, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
1140+
ret <16 x i32> %out
1141+
}

0 commit comments

Comments
 (0)