Skip to content

Commit 2af2634

Browse files
authored
[RISCV] Use vcompress in deinterleave2 intrinsic lowering (#118325)
This is analogous to febbf91 which added shuffle lowering using vcompress; we can do the same thing in the deinterleave2 lowering path which is used for scalable vectors. Note that we can further improve this for high lmul usage by adjusting how we materialize the mask (whose result is at most m1 with a known bit pattern). I am deliberately staging the work so that the changes to reduce register pressure are more easily evaluated on their own merit.
1 parent fc9052e commit 2af2634

File tree

3 files changed

+101
-165
lines changed

3 files changed

+101
-165
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -10736,10 +10736,6 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
1073610736
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
1073710737
Op.getOperand(0), Op.getOperand(1));
1073810738

10739-
// We want to operate on all lanes, so get the mask and VL and mask for it
10740-
auto [Mask, VL] = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget);
10741-
SDValue Passthru = DAG.getUNDEF(ConcatVT);
10742-
1074310739
// We can deinterleave through vnsrl.wi if the element type is smaller than
1074410740
// ELEN
1074510741
if (VecVT.getScalarSizeInBits() < Subtarget.getELen()) {
@@ -10749,19 +10745,28 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
1074910745
}
1075010746

1075110747
// For the indices, use the same SEW to avoid an extra vsetvli
10748+
// TODO: If container type is larger than m1, we can consider using a splat
10749+
// of a constant instead of the following sequence
10750+
10751+
// Create a step vector of indices {0, 1, 2, ...}
1075210752
MVT IdxVT = ConcatVT.changeVectorElementTypeToInteger();
10753-
// Create a vector of even indices {0, 2, 4, ...}
10754-
SDValue EvenIdx =
10755-
DAG.getStepVector(DL, IdxVT, APInt(IdxVT.getScalarSizeInBits(), 2));
10756-
// Create a vector of odd indices {1, 3, 5, ... }
10757-
SDValue OddIdx =
10758-
DAG.getNode(ISD::ADD, DL, IdxVT, EvenIdx, DAG.getConstant(1, DL, IdxVT));
10759-
10760-
// Gather the even and odd elements into two separate vectors
10761-
SDValue EvenWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
10762-
Concat, EvenIdx, Passthru, Mask, VL);
10763-
SDValue OddWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
10764-
Concat, OddIdx, Passthru, Mask, VL);
10753+
SDValue StepVec = DAG.getStepVector(DL, IdxVT);
10754+
// 0, 1, 0, 1, 0, 1
10755+
SDValue ZeroOnes =
10756+
DAG.getNode(ISD::AND, DL, IdxVT, StepVec, DAG.getConstant(1, DL, IdxVT));
10757+
MVT MaskVT = ConcatVT.changeVectorElementType(MVT::i1);
10758+
SDValue EvenMask =
10759+
DAG.getSetCC(DL, MaskVT, ZeroOnes, DAG.getConstant(0, DL, IdxVT),
10760+
ISD::CondCode::SETEQ);
10761+
// Have the latter be the not of the former to minimize the live range of
10762+
// the index vector since that might be large.
10763+
SDValue OddMask = DAG.getLogicalNOT(DL, EvenMask, MaskVT);
10764+
10765+
// vcompress the even and odd elements into two separate vectors
10766+
SDValue EvenWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
10767+
EvenMask, DAG.getUNDEF(ConcatVT));
10768+
SDValue OddWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
10769+
OddMask, DAG.getUNDEF(ConcatVT));
1076510770

1076610771
// Extract the result half of the gather for even and odd
1076710772
SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,

llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll

Lines changed: 18 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -106,95 +106,55 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
106106
; CHECK-NEXT: addi sp, sp, -16
107107
; CHECK-NEXT: .cfi_def_cfa_offset 16
108108
; CHECK-NEXT: csrr a1, vlenb
109-
; CHECK-NEXT: li a2, 40
109+
; CHECK-NEXT: li a2, 24
110110
; CHECK-NEXT: mul a1, a1, a2
111111
; CHECK-NEXT: sub sp, sp, a1
112-
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
112+
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
113113
; CHECK-NEXT: csrr a1, vlenb
114114
; CHECK-NEXT: vl8re64.v v16, (a0)
115115
; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma
116116
; CHECK-NEXT: vid.v v8
117117
; CHECK-NEXT: slli a1, a1, 3
118-
; CHECK-NEXT: vadd.vv v24, v8, v8
119-
; CHECK-NEXT: csrr a2, vlenb
120-
; CHECK-NEXT: slli a2, a2, 4
121-
; CHECK-NEXT: add a2, sp, a2
122-
; CHECK-NEXT: addi a2, a2, 16
123-
; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
118+
; CHECK-NEXT: vand.vi v8, v8, 1
124119
; CHECK-NEXT: add a0, a0, a1
120+
; CHECK-NEXT: vmseq.vi v24, v8, 0
125121
; CHECK-NEXT: vl8re64.v v8, (a0)
126122
; CHECK-NEXT: csrr a0, vlenb
127-
; CHECK-NEXT: slli a0, a0, 5
128-
; CHECK-NEXT: add a0, sp, a0
129-
; CHECK-NEXT: addi a0, a0, 16
130-
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
131-
; CHECK-NEXT: vadd.vi v8, v24, 1
132-
; CHECK-NEXT: csrr a0, vlenb
133-
; CHECK-NEXT: li a1, 24
134-
; CHECK-NEXT: mul a0, a0, a1
123+
; CHECK-NEXT: slli a0, a0, 4
135124
; CHECK-NEXT: add a0, sp, a0
136125
; CHECK-NEXT: addi a0, a0, 16
137126
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
138-
; CHECK-NEXT: vrgather.vv v8, v16, v24
139-
; CHECK-NEXT: csrr a0, vlenb
140-
; CHECK-NEXT: li a1, 24
141-
; CHECK-NEXT: mul a0, a0, a1
142-
; CHECK-NEXT: add a0, sp, a0
143-
; CHECK-NEXT: addi a0, a0, 16
144-
; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
145-
; CHECK-NEXT: vrgather.vv v24, v16, v0
146-
; CHECK-NEXT: csrr a0, vlenb
147-
; CHECK-NEXT: slli a0, a0, 3
148-
; CHECK-NEXT: add a0, sp, a0
149-
; CHECK-NEXT: addi a0, a0, 16
150-
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
151-
; CHECK-NEXT: csrr a0, vlenb
152-
; CHECK-NEXT: slli a0, a0, 5
153-
; CHECK-NEXT: add a0, sp, a0
154-
; CHECK-NEXT: addi a0, a0, 16
155-
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
127+
; CHECK-NEXT: vmnot.m v6, v24
128+
; CHECK-NEXT: vcompress.vm v8, v16, v24
129+
; CHECK-NEXT: vmv1r.v v13, v24
130+
; CHECK-NEXT: vcompress.vm v24, v16, v6
131+
; CHECK-NEXT: vmv1r.v v12, v6
156132
; CHECK-NEXT: csrr a0, vlenb
157133
; CHECK-NEXT: slli a0, a0, 4
158134
; CHECK-NEXT: add a0, sp, a0
159135
; CHECK-NEXT: addi a0, a0, 16
160-
; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
161-
; CHECK-NEXT: vrgather.vv v24, v16, v0
136+
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
137+
; CHECK-NEXT: vcompress.vm v0, v16, v13
162138
; CHECK-NEXT: addi a0, sp, 16
163-
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
164-
; CHECK-NEXT: csrr a0, vlenb
165-
; CHECK-NEXT: li a1, 24
166-
; CHECK-NEXT: mul a0, a0, a1
167-
; CHECK-NEXT: add a0, sp, a0
168-
; CHECK-NEXT: addi a0, a0, 16
169-
; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
170-
; CHECK-NEXT: csrr a0, vlenb
171-
; CHECK-NEXT: slli a0, a0, 5
172-
; CHECK-NEXT: add a0, sp, a0
173-
; CHECK-NEXT: addi a0, a0, 16
174-
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
175-
; CHECK-NEXT: vrgather.vv v16, v24, v0
139+
; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
140+
; CHECK-NEXT: vcompress.vm v0, v16, v12
176141
; CHECK-NEXT: csrr a0, vlenb
177-
; CHECK-NEXT: slli a0, a0, 4
142+
; CHECK-NEXT: slli a0, a0, 3
178143
; CHECK-NEXT: add a0, sp, a0
179144
; CHECK-NEXT: addi a0, a0, 16
180-
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
145+
; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
181146
; CHECK-NEXT: addi a0, sp, 16
182147
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
183148
; CHECK-NEXT: vmv4r.v v12, v16
184149
; CHECK-NEXT: csrr a0, vlenb
185-
; CHECK-NEXT: slli a0, a0, 4
186-
; CHECK-NEXT: add a0, sp, a0
187-
; CHECK-NEXT: addi a0, a0, 16
188-
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
189-
; CHECK-NEXT: csrr a0, vlenb
190150
; CHECK-NEXT: slli a0, a0, 3
191151
; CHECK-NEXT: add a0, sp, a0
192152
; CHECK-NEXT: addi a0, a0, 16
193-
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
153+
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
194154
; CHECK-NEXT: vmv4r.v v28, v16
195155
; CHECK-NEXT: vmv8r.v v16, v24
196156
; CHECK-NEXT: csrr a0, vlenb
197-
; CHECK-NEXT: li a1, 40
157+
; CHECK-NEXT: li a1, 24
198158
; CHECK-NEXT: mul a0, a0, a1
199159
; CHECK-NEXT: add sp, sp, a0
200160
; CHECK-NEXT: .cfi_def_cfa sp, 16

0 commit comments

Comments (0)