
Commit 396b6bb

[RISCV] Recurse on second operand of two operand shuffles (#79197)
This builds on bdc4110. It completes the migration to a recursive shuffle lowering strategy: when we encounter an unknown two-argument shuffle, we lower each operand as a single-source permute and then combine the results with a vselect (i.e. a vmerge). For code quality, this relies on the post-isel combine, which will aggressively fold that vmerge back into the materialization of the second operand where possible. Note: this change includes only the most immediately obvious stylistic cleanup. There is further code movement that this enables, which I'll do as a separate patch, since rolling it into this one would produce an unreadable diff.
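To make the strategy concrete, here is a minimal standalone sketch (illustration only, not code from this patch) of the mask-splitting step: each element of a two-source shuffle mask becomes either an index into a single-source permute of the first operand or of the second, plus a select bit for the final vselect/vmerge. The example mask, the printing harness, and the omission of the SwapOps handling are assumptions made for brevity.

// Illustrative sketch only: split a two-source shuffle mask into two
// single-source masks plus a select mask, mirroring the loop in
// lowerVECTOR_SHUFFLE. Indices < NumElts read the first source, indices
// >= NumElts read the second, and -1 marks a don't-care (undef) lane.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;
  // Hypothetical mask for: shufflevector <4 x i32> %a, <4 x i32> %b -> {0, 5, 2, 7}
  std::vector<int> Mask = {0, 5, 2, 7};

  std::vector<int> ShuffleMaskLHS, ShuffleMaskRHS;
  std::vector<bool> SelectMask; // true = take this lane from the RHS permute

  for (int MaskIndex : Mask) {
    bool IsLHSOrUndefIndex = MaskIndex < NumElts;
    ShuffleMaskLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0 ? MaskIndex : -1);
    ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : MaskIndex - NumElts);
    SelectMask.push_back(!IsLHSOrUndefIndex);
  }

  // Resulting masks: ShuffleMaskLHS = {0, -1, 2, -1}, ShuffleMaskRHS = {-1, 1, -1, 3},
  // SelectMask = {0, 1, 0, 1}. Each side is now a single-source permute; a
  // vselect (vmerge) on SelectMask recombines them, and the post-isel combine
  // can fold that merge back into the materialization of the second operand.
  for (int i = 0; i < (int)Mask.size(); ++i)
    std::printf("lane %d: lhs=%d rhs=%d sel=%d\n", i, ShuffleMaskLHS[i],
                ShuffleMaskRHS[i], (int)SelectMask[i]);
  return 0;
}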
1 parent 56aa77e commit 396b6bb

9 files changed: +296 −391 lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 17 additions & 62 deletions
@@ -4975,12 +4975,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
 
   // As a backup, shuffles can be lowered via a vrgather instruction, possibly
   // merged with a second vrgather.
-  SmallVector<SDValue> GatherIndicesLHS, GatherIndicesRHS;
-
-  // Keep a track of which non-undef indices are used by each LHS/RHS shuffle
-  // half.
-  DenseMap<int, unsigned> LHSIndexCounts, RHSIndexCounts;
-
+  SmallVector<int> ShuffleMaskLHS, ShuffleMaskRHS;
   SmallVector<SDValue> MaskVals;
 
   // Now construct the mask that will be used by the blended vrgather operation.
@@ -4989,28 +4984,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ !SwapOps;
     MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
     bool IsLHSOrUndefIndex = MaskIndex < (int)NumElts;
-    GatherIndicesLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
-                                   ? DAG.getConstant(MaskIndex, DL, XLenVT)
-                                   : DAG.getUNDEF(XLenVT));
-    GatherIndicesRHS.push_back(
-        IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
-                          : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
-    if (IsLHSOrUndefIndex && MaskIndex >= 0)
-      ++LHSIndexCounts[MaskIndex];
-    if (!IsLHSOrUndefIndex)
-      ++RHSIndexCounts[MaskIndex - NumElts];
+    ShuffleMaskLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
+                             ? MaskIndex : -1);
+    ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts));
   }
 
   if (SwapOps) {
     std::swap(V1, V2);
-    std::swap(GatherIndicesLHS, GatherIndicesRHS);
+    std::swap(ShuffleMaskLHS, ShuffleMaskRHS);
   }
 
   assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
   MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
   SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
 
-  unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL;
   unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
   MVT IndexVT = VT.changeTypeToInteger();
   // Since we can't introduce illegal index types at this stage, use i16 and
@@ -5038,6 +5025,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   // are handled above.
   if (V2.isUndef()) {
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+    SmallVector<SDValue> GatherIndicesLHS;
+    for (int ShuffleIdx : ShuffleMaskLHS)
+      GatherIndicesLHS.push_back(ShuffleIdx != -1
+                                     ? DAG.getConstant(ShuffleIdx, DL, XLenVT)
+                                     : DAG.getUNDEF(XLenVT));
     SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
     LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
                                          Subtarget);
@@ -5046,50 +5038,13 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     return convertFromScalableVector(VT, Gather, DAG, Subtarget);
   }
 
-  // Translate the gather index we computed above (and possibly swapped)
-  // back to a shuffle mask.  This step should disappear once we complete
-  // the migration to recursive design.
-  SmallVector<int> ShuffleMaskLHS;
-  ShuffleMaskLHS.reserve(GatherIndicesLHS.size());
-  for (SDValue GatherIndex : GatherIndicesLHS) {
-    if (GatherIndex.isUndef()) {
-      ShuffleMaskLHS.push_back(-1);
-      continue;
-    }
-    auto *IdxC = cast<ConstantSDNode>(GatherIndex);
-    ShuffleMaskLHS.push_back(IdxC->getZExtValue());
-  }
-
-  // Recursively invoke lowering for the LHS as if there were no RHS.
-  // This allows us to leverage all of our single source permute tricks.
-  SDValue Gather =
-      DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
-  Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget);
-
-  // Blend in second vector source with an additional vrgather.
-  V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
-
-  MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
-  SelectMask =
-      convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
-
-  // If only one index is used, we can use a "splat" vrgather.
-  // TODO: We can splat the most-common index and fix-up any stragglers, if
-  // that's beneficial.
-  if (RHSIndexCounts.size() == 1) {
-    int SplatIndex = RHSIndexCounts.begin()->getFirst();
-    Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
-                         DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
-                         SelectMask, VL);
-  } else {
-    SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
-    RHSIndices =
-        convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-    Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
-                         SelectMask, VL);
-  }
-
-  return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+  // Recursively invoke lowering for each operand if we had two
+  // independent single source permutes, and then combine the result via a
+  // vselect.  Note that the vselect will likely be folded back into the
+  // second permute (vrgather, or other) by the post-isel combine.
+  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
+  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), ShuffleMaskRHS);
+  return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V2, V1);
 }
 
 bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll

Lines changed: 27 additions & 9 deletions
@@ -238,26 +238,44 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
 define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-LABEL: interleave_v32f32:
 ; V128:       # %bb.0:
+; V128-NEXT:    addi sp, sp, -16
+; V128-NEXT:    .cfi_def_cfa_offset 16
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 3
+; V128-NEXT:    sub sp, sp, a0
+; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; V128-NEXT:    vmv8r.v v0, v16
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vmv8r.v v16, v8
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v0, v8, 16
+; V128-NEXT:    vslidedown.vi v8, v0, 16
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v24, v0, v8
 ; V128-NEXT:    li a0, -1
 ; V128-NEXT:    vwmaccu.vx v24, a0, v8
-; V128-NEXT:    lui a1, %hi(.LCPI10_0)
-; V128-NEXT:    addi a1, a1, %lo(.LCPI10_0)
-; V128-NEXT:    li a2, 32
-; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; V128-NEXT:    vle16.v v12, (a1)
+; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; V128-NEXT:    vslidedown.vi v0, v16, 16
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v8, v0, v16
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
 ; V128-NEXT:    lui a1, 699051
 ; V128-NEXT:    addi a1, a1, -1366
+; V128-NEXT:    li a2, 32
 ; V128-NEXT:    vmv.s.x v0, a1
-; V128-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; V128-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v0, v8, v16
-; V128-NEXT:    vwmaccu.vx v0, a0, v16
+; V128-NEXT:    addi a1, sp, 16
+; V128-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT:    vwaddu.vv v0, v16, v8
+; V128-NEXT:    vwmaccu.vx v0, a0, v8
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 3
+; V128-NEXT:    add sp, sp, a0
+; V128-NEXT:    addi sp, sp, 16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32f32:

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll

Lines changed: 27 additions & 9 deletions
@@ -403,26 +403,44 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
 define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; V128-LABEL: interleave_v32i32:
 ; V128:       # %bb.0:
+; V128-NEXT:    addi sp, sp, -16
+; V128-NEXT:    .cfi_def_cfa_offset 16
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 3
+; V128-NEXT:    sub sp, sp, a0
+; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; V128-NEXT:    vmv8r.v v0, v16
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vmv8r.v v16, v8
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v0, v8, 16
+; V128-NEXT:    vslidedown.vi v8, v0, 16
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v24, v0, v8
 ; V128-NEXT:    li a0, -1
 ; V128-NEXT:    vwmaccu.vx v24, a0, v8
-; V128-NEXT:    lui a1, %hi(.LCPI17_0)
-; V128-NEXT:    addi a1, a1, %lo(.LCPI17_0)
-; V128-NEXT:    li a2, 32
-; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; V128-NEXT:    vle16.v v12, (a1)
+; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; V128-NEXT:    vslidedown.vi v0, v16, 16
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v8, v0, v16
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
 ; V128-NEXT:    lui a1, 699051
 ; V128-NEXT:    addi a1, a1, -1366
+; V128-NEXT:    li a2, 32
 ; V128-NEXT:    vmv.s.x v0, a1
-; V128-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; V128-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v0, v8, v16
-; V128-NEXT:    vwmaccu.vx v0, a0, v16
+; V128-NEXT:    addi a1, sp, 16
+; V128-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT:    vwaddu.vv v0, v16, v8
+; V128-NEXT:    vwmaccu.vx v0, a0, v8
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 3
+; V128-NEXT:    add sp, sp, a0
+; V128-NEXT:    addi sp, sp, 16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32i32:

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 8 additions & 15 deletions
@@ -611,12 +611,10 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: concat_4xi8_start_undef_at_start:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 224
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v10, v10, -4
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vslideup.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
   ret <8 x i8> %res
@@ -625,12 +623,10 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_start_into_end_non_contiguous:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 144
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v10, v10, -4
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vslideup.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 11>
   ret <8 x i8> %res
@@ -670,12 +666,11 @@ define <8 x i8> @merge_start_into_start(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_slidedown:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    li a0, 195
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 14, i32 15>
   ret <8 x i8> %res
@@ -686,12 +681,10 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
 ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vadd.vi v10, v10, -1
 ; CHECK-NEXT:    li a0, 234
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vslideup.vi v8, v9, 1, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 10, i32 6, i32 12, i32 13, i32 14>
   ret <8 x i8> %res
