
Commit c44ca73

[RISCV] Combine vslidedown_vl with known VL and offset to a smaller LMUL
If we know the VL and offset of a vslidedown_vl, we can work out the minimum number of registers it's going to operate across. We can reuse the logic from extract_vector_elt to perform it in a smaller type and reduce the LMUL.

The aim is to generalize llvm#65598 and hopefully extend this to vslideup_vl too, so that we can get the same optimisation for insert_subvector and insert_vector_elt.

One observation from adding this is that the vslide*_vl nodes all take a mask operand, but currently anything other than vmset_vl will fail to select, as all the patterns expect true_mask. So we need to create a new vmset_vl instead of using extract_subvector on the existing vmset_vl.
1 parent d346613 commit c44ca73
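
To make the register-count reasoning in the commit message concrete, here is a minimal standalone C++ sketch. It is not the LLVM helper itself (the function name and the power-of-two LMUL loop are illustrative assumptions); it only shows how a known VL and offset bound the highest element index a vslidedown reads, and hence the smallest LMUL that still covers it. The numbers in main() come from the comment added in the RISCVISelLowering.cpp diff below.

#include <cstdint>
#include <cstdio>
#include <optional>

// Hypothetical helper, not LLVM's getSmallestVTForIndex: given the minimum
// VLEN the subtarget guarantees, the element width, and a slidedown's
// constant offset and VL, return the smallest power-of-two LMUL whose first
// registers are guaranteed to contain every element the slidedown reads.
std::optional<unsigned> smallestLMULForSlidedown(unsigned MinVLenBits,
                                                 unsigned SEWBits,
                                                 uint64_t Offset, uint64_t VL,
                                                 unsigned OrigLMUL) {
  if (VL == 0)
    return std::nullopt;                        // nothing is read
  uint64_t MaxIdx = Offset + VL - 1;            // last element index touched
  uint64_t EltsPerReg = MinVLenBits / SEWBits;  // elements per LMUL=1 register
  for (unsigned LMUL = 1; LMUL < OrigLMUL; LMUL *= 2)
    if (MaxIdx < EltsPerReg * LMUL)             // fits in the first LMUL regs
      return LMUL;
  return std::nullopt;                          // no smaller LMUL helps
}

int main() {
  // Example from the comment in the new combine: Zvl128b (VLEN >= 128), e32,
  // offset 4, VL 2, source type nxv8i32 (LMUL=4). Elements 0..5 fit in the
  // first two registers, so the slidedown can run at LMUL=2 (nxv4i32).
  if (auto LMUL = smallestLMULForSlidedown(128, 32, /*Offset=*/4, /*VL=*/2,
                                           /*OrigLMUL=*/4))
    std::printf("LMUL=4 -> LMUL=%u\n", *LMUL);  // prints "LMUL=4 -> LMUL=2"
}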

File tree

3 files changed: +135 -89 lines


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 37 additions & 9 deletions

@@ -8805,15 +8805,6 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
     Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
   }
 
-  // Shrink down Vec so we're performing the slidedown on a smaller LMUL.
-  unsigned LastIdx = OrigIdx + SubVecVT.getVectorNumElements() - 1;
-  if (auto ShrunkVT =
-          getSmallestVTForIndex(ContainerVT, LastIdx, DL, DAG, Subtarget)) {
-    ContainerVT = *ShrunkVT;
-    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
-                      DAG.getVectorIdxConstant(0, DL));
-  }
-
   SDValue Mask =
       getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
   // Set the vector length to only the number of elements we care about. This
@@ -14260,6 +14251,43 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
       return V;
     break;
+  case RISCVISD::VSLIDEDOWN_VL: {
+    MVT OrigVT = N->getSimpleValueType(0);
+    auto *CVL = dyn_cast<ConstantSDNode>(N->getOperand(4));
+    auto *CIdx = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!CVL || !CIdx)
+      break;
+    unsigned MaxIdx = CVL->getZExtValue() + CIdx->getZExtValue() - 1;
+    // We can try and reduce the LMUL that a vslidedown uses if we know where
+    // the maximum index is. For example, if the target has Zvl128b, a
+    // vslidedown of e32 with an offset of 4 and VL of 2 is only going to
+    // read from the first 2 registers at most. So if we were operating at
+    // LMUL=4 (nxv8i32), we can reduce it to LMUL=2 (nxv4i32).
+    if (auto ShrunkVT =
+            getSmallestVTForIndex(OrigVT, MaxIdx, DL, DAG, Subtarget)) {
+      SDValue ShrunkPassthru =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, *ShrunkVT, N->getOperand(0),
+                      DAG.getVectorIdxConstant(0, DL));
+      SDValue ShrunkInVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, *ShrunkVT, N->getOperand(1),
+                      DAG.getVectorIdxConstant(0, DL));
+
+      // The only mask ever used in vslide*_vl nodes is vmset_vl, and the only
+      // patterns on vslide*_vl only accept vmset_vl. So create a new vmset
+      // since using an extract_subvector breaks patterns.
+      assert(N->getOperand(3).getOpcode() == RISCVISD::VMSET_VL);
+      SDValue ShrunkMask =
+          DAG.getNode(RISCVISD::VMSET_VL, SDLoc(N), getMaskTypeFor(*ShrunkVT),
+                      N->getOperand(4));
+      SDValue ShrunkSlidedown =
+          DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, *ShrunkVT,
+                      {ShrunkPassthru, ShrunkInVec, N->getOperand(2),
+                       ShrunkMask, N->getOperand(4), N->getOperand(5)});
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigVT, N->getOperand(0),
+                         ShrunkSlidedown, DAG.getVectorIdxConstant(0, DL));
+    }
+    break;
+  }
   case RISCVISD::VFMV_V_F_VL: {
     const MVT VT = N->getSimpleValueType(0);
     SDValue Passthru = N->getOperand(0);
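
As a quick cross-check of the comment in the combine above against the first test change below: a minimal sketch, assuming the slidedown emitted for extractelt_nxv4i64_imm has offset 2 and VL 1 (which the "vsetivli zero, 1" in the CHECK lines suggests), the same arithmetic predicts the m4 to m2 change.

#include <cassert>

int main() {
  // extractelt_nxv4i64_imm below: nxv4i64 source (LMUL=4) on Zvl128b, e64.
  unsigned EltsPerReg = 128 / 64;                // 2 e64 elements per register
  unsigned MaxIdx = /*Offset=*/2 + /*VL=*/1 - 1; // highest element index read
  unsigned RegsNeeded = MaxIdx / EltsPerReg + 1; // = 2 vector registers
  assert(RegsNeeded == 2);                       // so m2 is enough, not m4
  return 0;
}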

llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll

Lines changed: 10 additions & 8 deletions

@@ -679,12 +679,13 @@ define i64 @extractelt_nxv4i64_0(<vscale x 4 x i64> %v) {
 define i64 @extractelt_nxv4i64_imm(<vscale x 4 x i64> %v) {
 ; CHECK-LABEL: extractelt_nxv4i64_imm:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, ma
 ; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsrl.vx v12, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v12
 ; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a1
+; CHECK-NEXT: vmv.x.s a1, v8
 ; CHECK-NEXT: ret
   %r = extractelement <vscale x 4 x i64> %v, i32 2
   ret i64 %r
@@ -720,12 +721,13 @@ define i64 @extractelt_nxv8i64_0(<vscale x 8 x i64> %v) {
 define i64 @extractelt_nxv8i64_imm(<vscale x 8 x i64> %v) {
 ; CHECK-LABEL: extractelt_nxv8i64_imm:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, ma
 ; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsrl.vx v16, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v16
 ; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a1
+; CHECK-NEXT: vmv.x.s a1, v8
 ; CHECK-NEXT: ret
   %r = extractelement <vscale x 8 x i64> %v, i32 2
   ret i64 %r

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll

Lines changed: 88 additions & 72 deletions

@@ -875,11 +875,15 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV32-NEXT: vsrl.vx v12, v8, a0
 ; RV32-NEXT: vmv.x.s a1, v12
 ; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT: vslidedown.vi v12, v8, 1
-; RV32-NEXT: vsrl.vx v16, v12, a0
-; RV32-NEXT: vmv.x.s a3, v16
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vx v12, v12, a0
 ; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
 ; RV32-NEXT: vsrl.vx v16, v12, a0
 ; RV32-NEXT: vmv.x.s a5, v16
 ; RV32-NEXT: vmv.x.s a6, v12
@@ -903,19 +907,19 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV32-NEXT: vsrl.vx v12, v8, a0
 ; RV32-NEXT: vmv.x.s a0, v12
 ; RV32-NEXT: vmv.x.s s0, v8
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: add a4, a2, a4
-; RV32-NEXT: sltu a2, a4, a2
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a3, a2, a3
+; RV32-NEXT: sltu a2, a3, a2
 ; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a6, a4, a6
-; RV32-NEXT: sltu a2, a6, a4
+; RV32-NEXT: add a6, a3, a6
+; RV32-NEXT: sltu a2, a6, a3
 ; RV32-NEXT: add a1, a1, a5
-; RV32-NEXT: add a2, a2, a7
-; RV32-NEXT: add a1, a1, a2
 ; RV32-NEXT: add t0, a6, t0
-; RV32-NEXT: sltu a2, t0, a6
-; RV32-NEXT: add a2, a2, t1
+; RV32-NEXT: sltu a3, t0, a6
+; RV32-NEXT: add a2, a2, a7
 ; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a3, a3, t1
+; RV32-NEXT: add a1, a1, a3
 ; RV32-NEXT: add t2, t0, t2
 ; RV32-NEXT: sltu a2, t2, t0
 ; RV32-NEXT: add a2, a2, t3
@@ -1029,115 +1033,127 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT: vmv.x.s a0, v16
 ; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT: vslidedown.vi v16, v8, 1
+; RV32-NEXT: vmv.x.s a3, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s a4, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 2
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT: vsrl.vx v24, v16, a1
 ; RV32-NEXT: vmv.x.s a5, v24
 ; RV32-NEXT: vmv.x.s a6, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 2
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s a3, v24
-; RV32-NEXT: vmv.x.s a4, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
 ; RV32-NEXT: vslidedown.vi v16, v8, 3
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s2, v24
+; RV32-NEXT: vmv.x.s t0, v24
 ; RV32-NEXT: vmv.x.s a7, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
 ; RV32-NEXT: vslidedown.vi v16, v8, 4
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s3, v24
-; RV32-NEXT: vmv.x.s t0, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 5
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s4, v24
 ; RV32-NEXT: vmv.x.s t1, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 6
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s5, v24
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s t3, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 5
 ; RV32-NEXT: vmv.x.s t2, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s t5, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 6
+; RV32-NEXT: vmv.x.s t4, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s ra, v16
 ; RV32-NEXT: vslidedown.vi v16, v8, 7
 ; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s6, v24
-; RV32-NEXT: vmv.x.s t3, v16
+; RV32-NEXT: vmv.x.s s5, v24
+; RV32-NEXT: vmv.x.s t6, v16
 ; RV32-NEXT: vslidedown.vi v16, v8, 8
 ; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s7, v24
-; RV32-NEXT: vmv.x.s t4, v16
+; RV32-NEXT: vmv.x.s s6, v24
+; RV32-NEXT: vmv.x.s s0, v16
 ; RV32-NEXT: vslidedown.vi v16, v8, 9
 ; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s8, v24
-; RV32-NEXT: vmv.x.s t5, v16
+; RV32-NEXT: vmv.x.s s7, v24
+; RV32-NEXT: vmv.x.s s1, v16
 ; RV32-NEXT: vslidedown.vi v16, v8, 10
 ; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s9, v24
-; RV32-NEXT: vmv.x.s t6, v16
+; RV32-NEXT: vmv.x.s s8, v24
+; RV32-NEXT: vmv.x.s s2, v16
 ; RV32-NEXT: vslidedown.vi v16, v8, 11
 ; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s10, v24
-; RV32-NEXT: vmv.x.s s0, v16
+; RV32-NEXT: vmv.x.s s9, v24
+; RV32-NEXT: vmv.x.s s3, v16
 ; RV32-NEXT: vslidedown.vi v16, v8, 12
 ; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s11, v24
-; RV32-NEXT: vmv.x.s s1, v16
+; RV32-NEXT: vmv.x.s s10, v24
+; RV32-NEXT: vmv.x.s s4, v16
 ; RV32-NEXT: vslidedown.vi v0, v8, 13
 ; RV32-NEXT: vsrl.vx v16, v0, a1
-; RV32-NEXT: vmv.x.s ra, v16
+; RV32-NEXT: vmv.x.s s11, v16
 ; RV32-NEXT: vslidedown.vi v16, v8, 14
 ; RV32-NEXT: vsrl.vx v24, v16, a1
 ; RV32-NEXT: vslidedown.vi v8, v8, 15
 ; RV32-NEXT: vmv.x.s a2, v0
 ; RV32-NEXT: vsrl.vx v0, v8, a1
 ; RV32-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: add a5, a1, a5
-; RV32-NEXT: add a6, a0, a6
-; RV32-NEXT: sltu a0, a6, a0
-; RV32-NEXT: add a0, a5, a0
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: add a4, a6, a4
-; RV32-NEXT: sltu a1, a4, a6
-; RV32-NEXT: add a1, a1, s2
+; RV32-NEXT: add a4, a1, a4
+; RV32-NEXT: add a3, a0, a3
+; RV32-NEXT: sltu a0, a3, a0
+; RV32-NEXT: add a0, a4, a0
+; RV32-NEXT: add a0, a0, a5
+; RV32-NEXT: add a6, a3, a6
+; RV32-NEXT: sltu a1, a6, a3
+; RV32-NEXT: add a1, a1, t0
 ; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a7, a4, a7
-; RV32-NEXT: sltu a1, a7, a4
-; RV32-NEXT: add a1, a1, s3
+; RV32-NEXT: add a7, a6, a7
+; RV32-NEXT: sltu a1, a7, a6
+; RV32-NEXT: add a1, a1, t3
 ; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t0, a7, t0
-; RV32-NEXT: sltu a1, t0, a7
-; RV32-NEXT: add a1, a1, s4
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t1, t0, t1
-; RV32-NEXT: sltu a1, t1, t0
-; RV32-NEXT: add a1, a1, s5
+; RV32-NEXT: add t1, a7, t1
+; RV32-NEXT: sltu a1, t1, a7
+; RV32-NEXT: add a1, a1, t5
 ; RV32-NEXT: add a0, a0, a1
 ; RV32-NEXT: add t2, t1, t2
 ; RV32-NEXT: sltu a1, t2, t1
+; RV32-NEXT: add a1, a1, ra
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add t4, t2, t4
+; RV32-NEXT: sltu a1, t4, t2
+; RV32-NEXT: add a1, a1, s5
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add t6, t4, t6
+; RV32-NEXT: sltu a1, t6, t4
 ; RV32-NEXT: add a1, a1, s6
 ; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t3, t2, t3
-; RV32-NEXT: sltu a1, t3, t2
+; RV32-NEXT: add s0, t6, s0
+; RV32-NEXT: sltu a1, s0, t6
 ; RV32-NEXT: add a1, a1, s7
 ; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t4, t3, t4
-; RV32-NEXT: sltu a1, t4, t3
+; RV32-NEXT: add s1, s0, s1
+; RV32-NEXT: sltu a1, s1, s0
 ; RV32-NEXT: add a1, a1, s8
 ; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t5, t4, t5
-; RV32-NEXT: sltu a1, t5, t4
+; RV32-NEXT: add s2, s1, s2
+; RV32-NEXT: sltu a1, s2, s1
 ; RV32-NEXT: add a1, a1, s9
 ; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t6, t5, t6
-; RV32-NEXT: sltu a1, t6, t5
+; RV32-NEXT: add s3, s2, s3
+; RV32-NEXT: sltu a1, s3, s2
 ; RV32-NEXT: add a1, a1, s10
 ; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add s0, t6, s0
-; RV32-NEXT: sltu a1, s0, t6
+; RV32-NEXT: add s4, s3, s4
+; RV32-NEXT: sltu a1, s4, s3
 ; RV32-NEXT: add a1, a1, s11
 ; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add s1, s0, s1
-; RV32-NEXT: sltu a1, s1, s0
-; RV32-NEXT: add a1, a1, ra
-; RV32-NEXT: add a0, a0, a1
 ; RV32-NEXT: vmv.x.s a1, v24
-; RV32-NEXT: add a2, s1, a2
-; RV32-NEXT: sltu a3, a2, s1
+; RV32-NEXT: add a2, s4, a2
+; RV32-NEXT: sltu a3, a2, s4
 ; RV32-NEXT: add a1, a3, a1
 ; RV32-NEXT: vmv.x.s a3, v16
 ; RV32-NEXT: add a0, a0, a1
