Commit 325d4a1
Revert "[RISCV] Recurse on first operand of two operand shuffles (#79180)" (#80238)
This reverts commit bdc4110 on the release/18.x branch. This change was the first in a mini-series, and while I'm not aware of any particular problem from having it on its own in the branch, it seems safer to ship with the previous known-good state.
1 parent ab57f6c commit 325d4a1

6 files changed (+407, -347 lines)
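For context, #79180 (reverted here) changed lowerVECTOR_SHUFFLE to recurse on the first operand of a two-operand shuffle, so the LHS half could reuse every single-source permute trick before the RHS was blended in with a masked vrgather. A minimal example of the kind of shuffle affected (the function name and mask are our own illustration, not taken from this commit's tests):

; Lanes 0,2,4,6 select from %v; lanes 1,3,5,7 select from %w (indices 8-15),
; so lowering needs a gather from each source plus a lane-select mask.
define <8 x i8> @two_source_example(<8 x i8> %v, <8 x i8> %w) {
  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x i8> %res
}

With the revert applied, both halves go back through the index-vector vrgather path restored in RISCVISelLowering.cpp below.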

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 44 additions & 48 deletions
@@ -5033,60 +5033,56 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   MVT IndexContainerVT =
       ContainerVT.changeVectorElementType(IndexVT.getScalarType());
 
-  // Base case for the recursion just below - handle the worst case
-  // single source permutation. Note that all the splat variants
-  // are handled above.
-  if (V2.isUndef()) {
+  SDValue Gather;
+  // TODO: This doesn't trigger for i64 vectors on RV32, since there we
+  // encounter a bitcasted BUILD_VECTOR with low/high i32 values.
+  if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
+    Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG,
+                              Subtarget);
+  } else {
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
-    SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
-    LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
-                                         Subtarget);
-    SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
-                                 DAG.getUNDEF(ContainerVT), TrueMask, VL);
-    return convertFromScalableVector(VT, Gather, DAG, Subtarget);
-  }
-
-  // Translate the gather index we computed above (and possibly swapped)
-  // back to a shuffle mask. This step should disappear once we complete
-  // the migration to recursive design.
-  SmallVector<int> ShuffleMaskLHS;
-  ShuffleMaskLHS.reserve(GatherIndicesLHS.size());
-  for (SDValue GatherIndex : GatherIndicesLHS) {
-    if (GatherIndex.isUndef()) {
-      ShuffleMaskLHS.push_back(-1);
-      continue;
+    // If only one index is used, we can use a "splat" vrgather.
+    // TODO: We can splat the most-common index and fix-up any stragglers, if
+    // that's beneficial.
+    if (LHSIndexCounts.size() == 1) {
+      int SplatIndex = LHSIndexCounts.begin()->getFirst();
+      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
+                           DAG.getConstant(SplatIndex, DL, XLenVT),
+                           DAG.getUNDEF(ContainerVT), TrueMask, VL);
+    } else {
+      SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+      LHSIndices =
+          convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+                           DAG.getUNDEF(ContainerVT), TrueMask, VL);
     }
-    auto *IdxC = cast<ConstantSDNode>(GatherIndex);
-    ShuffleMaskLHS.push_back(IdxC->getZExtValue());
   }
 
-  // Recursively invoke lowering for the LHS as if there were no RHS.
-  // This allows us to leverage all of our single source permute tricks.
-  SDValue Gather =
-      DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
-  Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget);
+  // If a second vector operand is used by this shuffle, blend it in with an
+  // additional vrgather.
+  if (!V2.isUndef()) {
+    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
 
-  // Blend in second vector source with an additional vrgather.
-  V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+    MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+    SelectMask =
+        convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
 
-  MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
-  SelectMask =
-      convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
-
-  // If only one index is used, we can use a "splat" vrgather.
-  // TODO: We can splat the most-common index and fix-up any stragglers, if
-  // that's beneficial.
-  if (RHSIndexCounts.size() == 1) {
-    int SplatIndex = RHSIndexCounts.begin()->getFirst();
-    Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
-                         DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
-                         SelectMask, VL);
-  } else {
-    SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
-    RHSIndices =
-        convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-    Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
-                         SelectMask, VL);
+    // If only one index is used, we can use a "splat" vrgather.
+    // TODO: We can splat the most-common index and fix-up any stragglers, if
+    // that's beneficial.
+    if (RHSIndexCounts.size() == 1) {
+      int SplatIndex = RHSIndexCounts.begin()->getFirst();
+      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
+                           DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
+                           SelectMask, VL);
+    } else {
+      SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+      RHSIndices =
+          convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
+                           SelectMask, VL);
+    }
   }
 
   return convertFromScalableVector(VT, Gather, DAG, Subtarget);
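As the restored comments above note, when every lane drawn from one source uses the same index, the lowering emits a scalar-index vrgather.vx (the LHSIndexCounts.size() == 1 and RHSIndexCounts.size() == 1 paths) instead of materializing an index vector for vrgather.vv. A hedged sketch of IR that could exercise the LHS splat-index path (our example, not from this commit):

define <4 x i32> @lhs_single_index(<4 x i32> %x, <4 x i32> %y) {
  ; Both lanes taken from %x use index 2, so LHSIndexCounts has one entry;
  ; the lanes from %y (indices 5 and 7 -> y[1], y[3]) still need vrgather.vv.
  %res = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 2, i32 2, i32 5, i32 7>
  ret <4 x i32> %res
}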

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll

Lines changed: 27 additions & 14 deletions
@@ -238,26 +238,39 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
 define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-LABEL: interleave_v32f32:
 ; V128:       # %bb.0:
-; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v0, v8, 16
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v0, v8
-; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v24, a0, v8
-; V128-NEXT:    lui a1, %hi(.LCPI10_0)
-; V128-NEXT:    addi a1, a1, %lo(.LCPI10_0)
-; V128-NEXT:    li a2, 32
-; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; V128-NEXT:    vle16.v v12, (a1)
-; V128-NEXT:    lui a1, 699051
-; V128-NEXT:    addi a1, a1, -1366
-; V128-NEXT:    vmv.s.x v0, a1
+; V128-NEXT:    addi sp, sp, -16
+; V128-NEXT:    .cfi_def_cfa_offset 16
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 2
+; V128-NEXT:    sub sp, sp, a0
+; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; V128-NEXT:    lui a0, %hi(.LCPI10_0)
+; V128-NEXT:    addi a0, a0, %lo(.LCPI10_0)
+; V128-NEXT:    li a1, 32
+; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; V128-NEXT:    vle16.v v4, (a0)
+; V128-NEXT:    lui a0, %hi(.LCPI10_1)
+; V128-NEXT:    addi a0, a0, %lo(.LCPI10_1)
+; V128-NEXT:    vle16.v v24, (a0)
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs4r.v v24, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    lui a0, 699051
+; V128-NEXT:    addi a0, a0, -1366
+; V128-NEXT:    vmv.s.x v0, a0
+; V128-NEXT:    vrgatherei16.vv v24, v8, v4
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vl4r.v v12, (a0) # Unknown-size Folded Reload
 ; V128-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v0, v8, v16
+; V128-NEXT:    li a0, -1
 ; V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 2
+; V128-NEXT:    add sp, sp, a0
+; V128-NEXT:    addi sp, sp, 16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32f32:
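Why a stack spill appears in this test (our reading of the generated code, not stated in the commit): at LMUL=8 the two sources and the gather result occupy v8, v16 and v24 (8 registers each), and the first e16 index vector takes v4 (4 registers), which already accounts for

8 (v8) + 8 (v16) + 8 (v24) + 4 (v4) = 28 of the 32 vector registers,

so the second 4-register index vector (the .LCPI10_1 constant) no longer fits and is spilled and reloaded around the masked vrgatherei16.vv.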

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll

Lines changed: 35 additions & 28 deletions
@@ -188,30 +188,24 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V128-LABEL: interleave_v4i32_offset_1:
 ; V128:       # %bb.0:
-; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; V128-NEXT:    vwaddu.vv v10, v8, v8
-; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v10, a0, v8
 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; V128-NEXT:    vid.v v8
-; V128-NEXT:    vsrl.vi v8, v8, 1
+; V128-NEXT:    vid.v v10
+; V128-NEXT:    vsrl.vi v11, v10, 1
+; V128-NEXT:    vrgather.vv v10, v8, v11
 ; V128-NEXT:    vmv.v.i v0, 10
-; V128-NEXT:    vadd.vi v8, v8, 1
+; V128-NEXT:    vadd.vi v8, v11, 1
 ; V128-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; V128-NEXT:    vmv.v.v v8, v10
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v4i32_offset_1:
 ; V512:       # %bb.0:
-; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; V512-NEXT:    vwaddu.vv v10, v8, v8
-; V512-NEXT:    li a0, -1
-; V512-NEXT:    vwmaccu.vx v10, a0, v8
 ; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
-; V512-NEXT:    vid.v v8
-; V512-NEXT:    vsrl.vi v8, v8, 1
+; V512-NEXT:    vid.v v10
+; V512-NEXT:    vsrl.vi v11, v10, 1
+; V512-NEXT:    vrgather.vv v10, v8, v11
 ; V512-NEXT:    vmv.v.i v0, 10
-; V512-NEXT:    vadd.vi v8, v8, 1
+; V512-NEXT:    vadd.vi v8, v11, 1
 ; V512-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; V512-NEXT:    vmv1r.v v8, v10
 ; V512-NEXT:    ret
@@ -403,26 +397,39 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
 define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; V128-LABEL: interleave_v32i32:
 ; V128:       # %bb.0:
-; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v0, v8, 16
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v0, v8
-; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v24, a0, v8
-; V128-NEXT:    lui a1, %hi(.LCPI17_0)
-; V128-NEXT:    addi a1, a1, %lo(.LCPI17_0)
-; V128-NEXT:    li a2, 32
-; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; V128-NEXT:    vle16.v v12, (a1)
-; V128-NEXT:    lui a1, 699051
-; V128-NEXT:    addi a1, a1, -1366
-; V128-NEXT:    vmv.s.x v0, a1
+; V128-NEXT:    addi sp, sp, -16
+; V128-NEXT:    .cfi_def_cfa_offset 16
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 2
+; V128-NEXT:    sub sp, sp, a0
+; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; V128-NEXT:    lui a0, %hi(.LCPI17_0)
+; V128-NEXT:    addi a0, a0, %lo(.LCPI17_0)
+; V128-NEXT:    li a1, 32
+; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; V128-NEXT:    vle16.v v4, (a0)
+; V128-NEXT:    lui a0, %hi(.LCPI17_1)
+; V128-NEXT:    addi a0, a0, %lo(.LCPI17_1)
+; V128-NEXT:    vle16.v v24, (a0)
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs4r.v v24, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    lui a0, 699051
+; V128-NEXT:    addi a0, a0, -1366
+; V128-NEXT:    vmv.s.x v0, a0
+; V128-NEXT:    vrgatherei16.vv v24, v8, v4
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vl4r.v v12, (a0) # Unknown-size Folded Reload
 ; V128-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v0, v8, v16
+; V128-NEXT:    li a0, -1
 ; V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 2
+; V128-NEXT:    add sp, sp, a0
+; V128-NEXT:    addi sp, sp, 16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32i32:

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 27 additions & 16 deletions
@@ -612,11 +612,13 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: concat_4xi8_start_undef_at_start:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vid.v v11
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    li a0, 224
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v10, v10, -4
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vadd.vi v8, v11, -4
+; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
   ret <8 x i8> %res
@@ -626,11 +628,13 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_start_into_end_non_contiguous:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vid.v v11
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    li a0, 144
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v10, v10, -4
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vadd.vi v8, v11, -4
+; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 11>
   ret <8 x i8> %res
@@ -671,11 +675,13 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_slidedown:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vid.v v11
+; CHECK-NEXT:    vadd.vi v12, v11, 1
 ; CHECK-NEXT:    li a0, 195
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vrgather.vv v10, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 14, i32 15>
   ret <8 x i8> %res
@@ -686,12 +692,14 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
 ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vadd.vi v10, v10, -1
+; CHECK-NEXT:    vid.v v11
+; CHECK-NEXT:    vadd.vi v12, v11, 2
+; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    li a0, 234
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vadd.vi v8, v11, -1
+; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 10, i32 6, i32 12, i32 13, i32 14>
   ret <8 x i8> %res
@@ -702,13 +710,16 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: unmergable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vadd.vi v11, v10, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI46_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    vle8.v v12, (a0)
 ; CHECK-NEXT:    li a0, 234
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vrgather.vv v10, v9, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
   ret <8 x i8> %res
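A reading aid for the v0 masks above (our annotation, not part of the commit): the immediate moved into v0 with vmv.s.x sets bit i when lane i takes the second source. For @unmergable, lanes 1, 3, 5, 6 and 7 read from %w, so

2 + 8 + 32 + 64 + 128 = 0b11101010 = 234

which matches the li a0, 234 feeding vmv.s.x v0, a0.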
