Skip to content

Commit febbf91

Browse files
authored
[RISCV] Match vcompress during shuffle lowering (#117748)
This change matches a subset of vcompress patterns during shuffle lowering. The subset implemented requires a contiguous prefix of demanded elements followed by undefs. This subset was chosen for two reasons: 1) which elements to spuriously demand is a non-obvious problem, and 2) my first several attempts at implementing the general case were buggy. I decided to go with the simple case to start with. vcompress scales better with LMUL than a general vrgather and, at least on the SpaceMit X60, has higher throughput even at m1. It also has the advantage of requiring smaller vector constants, at one bit per element, as opposed to vrgather, which needs a minimum of 8 bits per element. The downside to using vcompress is that we can't fold a vselect into it, as there is no masked vcompress variant. For reference, here are the relevant throughputs from camel-cdr's data table on BP3 (X60):
vrgather.vv v8,v16,v24 4.0 16.0 64.0 256.0
vcompress.vm v8,v16,v24 3.0 10.0 36.0 136.0
vmerge.vvm v8,v16,v24,v0 2.0 4.0 8.0 16.0
The largest concern with the extra vmerge is that we locally increase register pressure. If we do have masking, we also have a passthru; without the ability to fold that into the vcompress, we need to keep it alive a bit longer. This can hurt at e.g. m8, where we have very few architectural registers. As compared with the vrgather.vv sequence, this is only one additional m1 VREG, since we no longer need the index vector. It compares slightly worse against vrgatherei16.vv, which can use index vectors smaller than the other operands. Note that we could potentially fold the vmerge if only tail elements are being preserved; I haven't investigated this. It is unfortunately hard, given our current lowering structure, to know if we're emitting a shuffle where masking will follow. Thankfully, it doesn't seem to show up much in practice, so I think we can probably ignore it. This patch only handles single-source compress idioms at the moment.
This is an effort to avoid interacting with other patches on review for changing how we canonicalize length changing shuffles.
1 parent 1669ac4 commit febbf91

8 files changed

+796
-711
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5155,6 +5155,28 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
51555155
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
51565156
}
51575157

5158+
// Matches a subset of compress masks with a contiguous prefix of output
5159+
// elements. This could be extended to allow gaps by deciding which
5160+
// source elements to spuriously demand.
5161+
static bool isCompressMask(ArrayRef<int> Mask) {
5162+
int Last = -1;
5163+
bool SawUndef = false;
5164+
for (int i = 0; i < Mask.size(); i++) {
5165+
if (Mask[i] == -1) {
5166+
SawUndef = true;
5167+
continue;
5168+
}
5169+
if (SawUndef)
5170+
return false;
5171+
if (i > Mask[i])
5172+
return false;
5173+
if (Mask[i] <= Last)
5174+
return false;
5175+
Last = Mask[i];
5176+
}
5177+
return true;
5178+
}
5179+
51585180
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
51595181
const RISCVSubtarget &Subtarget) {
51605182
SDValue V1 = Op.getOperand(0);
@@ -5372,6 +5394,25 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
53725394
if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
53735395
return V;
53745396

5397+
// Can we generate a vcompress instead of a vrgather? These scale better
5398+
// at high LMUL, at the cost of not being able to fold a following select
5399+
// into them. The mask constants are also smaller than the index vector
5400+
// constants, and thus easier to materialize.
5401+
if (isCompressMask(Mask)) {
5402+
SmallVector<SDValue> MaskVals(NumElts,
5403+
DAG.getConstant(false, DL, XLenVT));
5404+
for (auto Idx : Mask) {
5405+
if (Idx == -1)
5406+
break;
5407+
assert(Idx >= 0 && (unsigned)Idx < NumElts);
5408+
MaskVals[Idx] = DAG.getConstant(true, DL, XLenVT);
5409+
}
5410+
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5411+
SDValue CompressMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
5412+
return DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, V1, CompressMask,
5413+
DAG.getUNDEF(VT));
5414+
}
5415+
53755416
if (VT.getScalarSizeInBits() == 8 &&
53765417
any_of(Mask, [&](const auto &Idx) { return Idx > 255; })) {
53775418
// On such a vector we're unable to use i8 as the index type.

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,16 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
4040
; CHECK-LABEL: hang_when_merging_stores_after_legalization:
4141
; CHECK: # %bb.0:
4242
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
43-
; CHECK-NEXT: vid.v v12
43+
; CHECK-NEXT: vmv.v.i v12, -14
44+
; CHECK-NEXT: vid.v v14
4445
; CHECK-NEXT: li a0, 7
46+
; CHECK-NEXT: vmadd.vx v14, a0, v12
47+
; CHECK-NEXT: li a0, 129
48+
; CHECK-NEXT: vmv.s.x v15, a0
4549
; CHECK-NEXT: vmv.v.i v0, 12
46-
; CHECK-NEXT: vmul.vx v14, v12, a0
47-
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
48-
; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
49-
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
50-
; CHECK-NEXT: vadd.vi v8, v14, -14
5150
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
52-
; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t
51+
; CHECK-NEXT: vcompress.vm v12, v8, v15
52+
; CHECK-NEXT: vrgatherei16.vv v12, v10, v14, v0.t
5353
; CHECK-NEXT: vmv1r.v v8, v12
5454
; CHECK-NEXT: ret
5555
%z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -138,17 +138,17 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
138138
define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
139139
; CHECK-LABEL: vrgather_shuffle_vx_v4f64:
140140
; CHECK: # %bb.0:
141-
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
142-
; CHECK-NEXT: vid.v v10
143141
; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
144142
; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0)
145-
; CHECK-NEXT: li a0, 3
146-
; CHECK-NEXT: vmul.vx v12, v10, a0
143+
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
144+
; CHECK-NEXT: vmv.v.i v10, 9
145+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
146+
; CHECK-NEXT: vcompress.vm v12, v8, v10
147+
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
147148
; CHECK-NEXT: vmv.v.i v0, 3
148-
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
149-
; CHECK-NEXT: vfmv.v.f v10, fa5
150-
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t
151-
; CHECK-NEXT: vmv.v.v v8, v10
149+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
150+
; CHECK-NEXT: vfmv.v.f v8, fa5
151+
; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
152152
; CHECK-NEXT: ret
153153
%s = shufflevector <4 x double> %x, <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
154154
ret <4 x double> %s

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 29 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -113,14 +113,12 @@ define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) {
113113
define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) {
114114
; CHECK-LABEL: vrgather_shuffle_vx_v4i16:
115115
; CHECK: # %bb.0:
116-
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
117-
; CHECK-NEXT: vid.v v9
118-
; CHECK-NEXT: li a0, 3
116+
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
117+
; CHECK-NEXT: vmv.v.i v9, 9
119118
; CHECK-NEXT: vmv.v.i v0, 3
120-
; CHECK-NEXT: vmul.vx v10, v9, a0
121-
; CHECK-NEXT: vmv.v.i v9, 5
122-
; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t
123-
; CHECK-NEXT: vmv1r.v v8, v9
119+
; CHECK-NEXT: vcompress.vm v10, v8, v9
120+
; CHECK-NEXT: vmv.v.i v8, 5
121+
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
124122
; CHECK-NEXT: ret
125123
%s = shufflevector <4 x i16> %x, <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
126124
ret <4 x i16> %s
@@ -723,21 +721,22 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
723721
define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) {
724722
; CHECK-LABEL: shuffle_v64i8_v8i8:
725723
; CHECK: # %bb.0:
726-
; CHECK-NEXT: li a0, 32
724+
; CHECK-NEXT: lui a0, 4112
727725
; CHECK-NEXT: li a1, 240
728726
; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
729727
; CHECK-NEXT: vmv.s.x v0, a1
730-
; CHECK-NEXT: lui a1, 98561
731-
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
732-
; CHECK-NEXT: vid.v v12
733-
; CHECK-NEXT: vsll.vi v14, v12, 3
734-
; CHECK-NEXT: vrgather.vv v12, v8, v14
735-
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
736-
; CHECK-NEXT: vslidedown.vx v8, v8, a0
737-
; CHECK-NEXT: addi a1, a1, -2048
728+
; CHECK-NEXT: li a1, 32
729+
; CHECK-NEXT: addi a0, a0, 257
730+
; CHECK-NEXT: vmv.s.x v14, a0
731+
; CHECK-NEXT: lui a0, 98561
732+
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
733+
; CHECK-NEXT: vcompress.vm v12, v8, v14
734+
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
735+
; CHECK-NEXT: vslidedown.vx v8, v8, a1
736+
; CHECK-NEXT: addi a0, a0, -2048
738737
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
739-
; CHECK-NEXT: vmv.v.x v10, a1
740-
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
738+
; CHECK-NEXT: vmv.v.x v10, a0
739+
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
741740
; CHECK-NEXT: vrgather.vv v12, v8, v10, v0.t
742741
; CHECK-NEXT: vmv1r.v v8, v12
743742
; CHECK-NEXT: ret
@@ -748,11 +747,10 @@ define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) {
748747
define <8 x i8> @shuffle_compress_singlesrc_e8(<8 x i8> %v) {
749748
; CHECK-LABEL: shuffle_compress_singlesrc_e8:
750749
; CHECK: # %bb.0:
751-
; CHECK-NEXT: lui a0, %hi(.LCPI49_0)
752-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI49_0)
750+
; CHECK-NEXT: li a0, 181
753751
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
754-
; CHECK-NEXT: vle8.v v10, (a0)
755-
; CHECK-NEXT: vrgather.vv v9, v8, v10
752+
; CHECK-NEXT: vmv.s.x v10, a0
753+
; CHECK-NEXT: vcompress.vm v9, v8, v10
756754
; CHECK-NEXT: vmv1r.v v8, v9
757755
; CHECK-NEXT: ret
758756
%out = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 5, i32 7, i32 undef, i32 undef, i32 undef>
@@ -762,11 +760,10 @@ define <8 x i8> @shuffle_compress_singlesrc_e8(<8 x i8> %v) {
762760
define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) {
763761
; CHECK-LABEL: shuffle_compress_singlesrc_e16:
764762
; CHECK: # %bb.0:
765-
; CHECK-NEXT: lui a0, %hi(.LCPI50_0)
766-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI50_0)
763+
; CHECK-NEXT: li a0, 181
767764
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
768-
; CHECK-NEXT: vle16.v v10, (a0)
769-
; CHECK-NEXT: vrgather.vv v9, v8, v10
765+
; CHECK-NEXT: vmv.s.x v10, a0
766+
; CHECK-NEXT: vcompress.vm v9, v8, v10
770767
; CHECK-NEXT: vmv.v.v v8, v9
771768
; CHECK-NEXT: ret
772769
%out = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 5, i32 7, i32 undef, i32 undef, i32 undef>
@@ -776,11 +773,10 @@ define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) {
776773
define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) {
777774
; CHECK-LABEL: shuffle_compress_singlesrc_e32:
778775
; CHECK: # %bb.0:
779-
; CHECK-NEXT: lui a0, %hi(.LCPI51_0)
780-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI51_0)
776+
; CHECK-NEXT: li a0, 115
781777
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
782-
; CHECK-NEXT: vle16.v v12, (a0)
783-
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
778+
; CHECK-NEXT: vmv.s.x v12, a0
779+
; CHECK-NEXT: vcompress.vm v10, v8, v12
784780
; CHECK-NEXT: vmv.v.v v8, v10
785781
; CHECK-NEXT: ret
786782
%out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef>
@@ -790,11 +786,10 @@ define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) {
790786
define <8 x i64> @shuffle_compress_singlesrc_e64(<8 x i64> %v) {
791787
; CHECK-LABEL: shuffle_compress_singlesrc_e64:
792788
; CHECK: # %bb.0:
793-
; CHECK-NEXT: lui a0, %hi(.LCPI52_0)
794-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI52_0)
789+
; CHECK-NEXT: li a0, 181
795790
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
796-
; CHECK-NEXT: vle16.v v16, (a0)
797-
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
791+
; CHECK-NEXT: vmv.s.x v16, a0
792+
; CHECK-NEXT: vcompress.vm v12, v8, v16
798793
; CHECK-NEXT: vmv.v.v v8, v12
799794
; CHECK-NEXT: ret
800795
%out = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 5, i32 7, i32 undef, i32 undef, i32 undef>

0 commit comments

Comments
 (0)