Skip to content

Commit 0ce99e4

Browse files
committed
[RISCV] Merge shuffle sources if lanes are disjoint
In x264, there are a few kernels with shuffles like this:

    %41 = add nsw <16 x i32> %39, %40
    %42 = sub nsw <16 x i32> %39, %40
    %43 = shufflevector <16 x i32> %41, <16 x i32> %42, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>

Because this is a complex two-source shuffle, this will get lowered as two vrgather.vvs that are blended together:

    vadd.vv v20, v16, v12
    vsub.vv v12, v16, v12
    vrgatherei16.vv v24, v20, v10
    vrgatherei16.vv v24, v12, v16, v0.t

However the indices coming from each source are disjoint, so we can blend the two together and perform a single source shuffle instead:

    %41 = add nsw <16 x i32> %39, %40
    %42 = sub nsw <16 x i32> %39, %40
    %43 = select <0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1> %41, %42
    %44 = shufflevector <16 x i32> %43, <16 x i32> poison, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 10, i32 14, i32 6, i32 2, i32 9, i32 13, i32 5, i32 1, i32 8, i32 12, i32 4, i32 0>

The select will likely get merged into the preceding instruction, and then we only have to do one vrgather.vv:

    vadd.vv v20, v16, v12
    vsub.vv v20, v16, v12, v0.t
    vrgatherei16.vv v24, v20, v10

This patch bails if either of the sources is a splat, however, since that will usually already have some sort of cheaper lowering via vrgather.vi.

This improves performance on 525.x264_r by 4.12% with -O3 -flto -march=rva22u64_v on the spacemit-x60: https://lnt.lukelau.me/db_default/v4/nts/71?compare_to=70
1 parent 36eb019 commit 0ce99e4

6 files changed

+901
-767
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

+71
Original file line numberDiff line numberDiff line change
@@ -5197,6 +5197,67 @@ static bool isCompressMask(ArrayRef<int> Mask) {
51975197
return true;
51985198
}
51995199

5200+
/// Given a shuffle where the indices are disjoint between the two sources,
5201+
/// e.g.:
5202+
///
5203+
/// t2:v4i8 = vector_shuffle t0:v4i8, t1:v4i8, <2, 7, 1, 4>
5204+
///
5205+
/// Merge the two sources into one and do a single source shuffle:
5206+
///
5207+
/// t2:v4i8 = vselect t1:v4i8, t0:v4i8, <0, 1, 0, 1>
5208+
/// t3:v4i8 = vector_shuffle t2:v4i8, undef, <2, 3, 1, 0>
5209+
///
5210+
/// A vselect will either be merged into a masked instruction or be lowered as a
5211+
/// vmerge.vvm, which is cheaper than a vrgather.vv.
5212+
static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
5213+
SelectionDAG &DAG,
5214+
const RISCVSubtarget &Subtarget) {
5215+
MVT VT = SVN->getSimpleValueType(0);
5216+
MVT XLenVT = Subtarget.getXLenVT();
5217+
SDLoc DL(SVN);
5218+
5219+
const ArrayRef<int> Mask = SVN->getMask();
5220+
5221+
// Work out which source each lane will come from.
5222+
SmallVector<int, 16> Srcs(Mask.size(), -1);
5223+
5224+
for (int Idx : Mask) {
5225+
if (Idx == -1)
5226+
continue;
5227+
unsigned SrcIdx = Idx % Mask.size();
5228+
int Src = (uint32_t)Idx < Mask.size() ? 0 : 1;
5229+
if (Srcs[SrcIdx] == -1)
5230+
// Mark this source as using this lane.
5231+
Srcs[SrcIdx] = Src;
5232+
else if (Srcs[SrcIdx] != Src)
5233+
// The other source is using this lane: not disjoint.
5234+
return SDValue();
5235+
}
5236+
5237+
SmallVector<SDValue> SelectMaskVals;
5238+
for (int Lane : Srcs) {
5239+
if (Lane == -1)
5240+
SelectMaskVals.push_back(DAG.getUNDEF(XLenVT));
5241+
else
5242+
SelectMaskVals.push_back(DAG.getConstant(Lane, DL, XLenVT));
5243+
}
5244+
MVT MaskVT = VT.changeVectorElementType(MVT::i1);
5245+
SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, SelectMaskVals);
5246+
SDValue Select = DAG.getNode(ISD::VSELECT, DL, VT, SelectMask,
5247+
SVN->getOperand(1), SVN->getOperand(0));
5248+
5249+
// Move all indices relative to the first source.
5250+
SmallVector<int> NewMask(Mask.size());
5251+
for (unsigned I = 0; I < Mask.size(); I++) {
5252+
if (Mask[I] == -1)
5253+
NewMask[I] = -1;
5254+
else
5255+
NewMask[I] = Mask[I] % Mask.size();
5256+
}
5257+
5258+
return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
5259+
}
5260+
52005261
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
52015262
const RISCVSubtarget &Subtarget) {
52025263
SDValue V1 = Op.getOperand(0);
@@ -5540,6 +5601,16 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
55405601
ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts));
55415602
}
55425603

5604+
// If the mask indices are disjoint between the two sources, we can lower it
5605+
// as a vselect + a single source vrgather.vv. Don't do this if the operands
5606+
// will be splatted since they will be lowered to something cheaper like
5607+
// vrgather.vi anyway.
5608+
if (!DAG.isSplatValue(V2) && !DAG.isSplatValue(V1) &&
5609+
!ShuffleVectorSDNode::isSplatMask(ShuffleMaskLHS.data(), VT) &&
5610+
!ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS.data(), VT))
5611+
if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget))
5612+
return V;
5613+
55435614
// Try to pick a profitable operand order.
55445615
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
55455616
SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts);

llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll

+58-45
Original file line numberDiff line numberDiff line change
@@ -104,60 +104,73 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) {
104104
; CHECK-NEXT: addi s0, sp, 1536
105105
; CHECK-NEXT: .cfi_def_cfa s0, 0
106106
; CHECK-NEXT: andi sp, sp, -512
107-
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
108-
; CHECK-NEXT: vmv8r.v v24, v8
107+
; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, ma
108+
; CHECK-NEXT: vmv.v.i v0, 0
109+
; CHECK-NEXT: lui a2, 16384
110+
; CHECK-NEXT: li a3, 1
109111
; CHECK-NEXT: li a0, 512
110112
; CHECK-NEXT: addi a1, sp, 512
111-
; CHECK-NEXT: vslidedown.vi v0, v24, 5
112-
; CHECK-NEXT: vmv.x.s a2, v24
113-
; CHECK-NEXT: li a3, 432
113+
; CHECK-NEXT: li a4, 43
114+
; CHECK-NEXT: slli a3, a3, 34
115+
; CHECK-NEXT: vmv.s.x v24, a3
116+
; CHECK-NEXT: li a3, 36
117+
; CHECK-NEXT: addi a2, a2, 129
118+
; CHECK-NEXT: slli a2, a2, 36
119+
; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma
120+
; CHECK-NEXT: vmv.s.x v0, a2
121+
; CHECK-NEXT: vsetivli zero, 3, e64, m1, tu, ma
122+
; CHECK-NEXT: vslideup.vi v0, v24, 2
123+
; CHECK-NEXT: li a2, 399
114124
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
115-
; CHECK-NEXT: vmv.v.x v8, a2
116-
; CHECK-NEXT: li a2, 431
117-
; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma
118-
; CHECK-NEXT: vslideup.vx v8, v0, a2
125+
; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
119126
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
120-
; CHECK-NEXT: vslidedown.vi v0, v24, 4
127+
; CHECK-NEXT: vslidedown.vx v8, v16, a4
128+
; CHECK-NEXT: vmv.x.s a4, v16
129+
; CHECK-NEXT: vslidedown.vx v24, v16, a3
130+
; CHECK-NEXT: vmv.x.s a3, v8
131+
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
132+
; CHECK-NEXT: vmv.v.x v8, a4
133+
; CHECK-NEXT: li a4, 398
134+
; CHECK-NEXT: vslide1down.vx v8, v8, a3
135+
; CHECK-NEXT: li a3, 432
136+
; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma
137+
; CHECK-NEXT: vslideup.vx v8, v24, a4
138+
; CHECK-NEXT: li a4, 431
121139
; CHECK-NEXT: li a2, 466
140+
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
141+
; CHECK-NEXT: vslidedown.vi v24, v16, 5
142+
; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma
143+
; CHECK-NEXT: vslideup.vx v8, v24, a4
122144
; CHECK-NEXT: li a3, 465
145+
; CHECK-NEXT: li a4, 62
123146
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
124-
; CHECK-NEXT: vse8.v v24, (a1)
125-
; CHECK-NEXT: lbu a1, 985(sp)
126-
; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma
127-
; CHECK-NEXT: vslideup.vx v8, v0, a3
128-
; CHECK-NEXT: li a2, 478
129-
; CHECK-NEXT: lbu a3, 1012(sp)
130-
; CHECK-NEXT: vmv.s.x v24, a1
131-
; CHECK-NEXT: li a1, 477
147+
; CHECK-NEXT: vse8.v v16, (a1)
148+
; CHECK-NEXT: li a0, 467
149+
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
150+
; CHECK-NEXT: vslidedown.vx v24, v16, a4
151+
; CHECK-NEXT: li a1, 478
152+
; CHECK-NEXT: vslidedown.vi v16, v16, 4
132153
; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma
133-
; CHECK-NEXT: vslideup.vx v8, v24, a1
134-
; CHECK-NEXT: li a1, 501
135-
; CHECK-NEXT: vmv.s.x v24, a3
136-
; CHECK-NEXT: li a2, 500
137-
; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma
154+
; CHECK-NEXT: vslideup.vx v8, v16, a3
155+
; CHECK-NEXT: lbu a3, 985(sp)
156+
; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma
138157
; CHECK-NEXT: vslideup.vx v8, v24, a2
139-
; CHECK-NEXT: lui a1, 2761
140-
; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, ma
141-
; CHECK-NEXT: vmv.v.i v24, 0
142-
; CHECK-NEXT: lui a2, 4
143-
; CHECK-NEXT: vmv.s.x v25, a2
144-
; CHECK-NEXT: lui a2, 1047552
145-
; CHECK-NEXT: addi a2, a2, 1
146-
; CHECK-NEXT: slli a2, a2, 23
147-
; CHECK-NEXT: addi a2, a2, 1
148-
; CHECK-NEXT: slli a2, a2, 18
149-
; CHECK-NEXT: vslide1down.vx v0, v24, a2
150-
; CHECK-NEXT: li a2, 64
151-
; CHECK-NEXT: slli a1, a1, 25
152-
; CHECK-NEXT: addi a1, a1, 501
153-
; CHECK-NEXT: slli a1, a1, 13
154-
; CHECK-NEXT: addi a1, a1, 512
155-
; CHECK-NEXT: vsetivli zero, 7, e64, m1, tu, ma
156-
; CHECK-NEXT: vslideup.vi v0, v25, 6
157-
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
158-
; CHECK-NEXT: vmv.v.x v24, a1
159-
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu
160-
; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
158+
; CHECK-NEXT: li a0, 477
159+
; CHECK-NEXT: lbu a2, 1012(sp)
160+
; CHECK-NEXT: vmv.s.x v16, a3
161+
; CHECK-NEXT: lbu a3, 674(sp)
162+
; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma
163+
; CHECK-NEXT: vslideup.vx v8, v16, a0
164+
; CHECK-NEXT: vmv.s.x v24, a3
165+
; CHECK-NEXT: li a0, 490
166+
; CHECK-NEXT: vmv.s.x v16, a2
167+
; CHECK-NEXT: li a1, 489
168+
; CHECK-NEXT: li a2, 501
169+
; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma
170+
; CHECK-NEXT: vslideup.vx v8, v24, a1
171+
; CHECK-NEXT: li a0, 500
172+
; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma
173+
; CHECK-NEXT: vslideup.vx v8, v16, a0
161174
; CHECK-NEXT: addi sp, s0, -1536
162175
; CHECK-NEXT: .cfi_def_cfa sp, 1536
163176
; CHECK-NEXT: ld ra, 1528(sp) # 8-byte Folded Reload

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll

+11-12
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ define <4 x half> @shuffle_v4f16(<4 x half> %x, <4 x half> %y) {
2929
define <8 x float> @shuffle_v8f32(<8 x float> %x, <8 x float> %y) {
3030
; CHECK-LABEL: shuffle_v8f32:
3131
; CHECK: # %bb.0:
32-
; CHECK-NEXT: li a0, -20
32+
; CHECK-NEXT: li a0, 19
3333
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
3434
; CHECK-NEXT: vmv.s.x v0, a0
35-
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
35+
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
3636
; CHECK-NEXT: ret
3737
%s = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
3838
ret <8 x float> %s
@@ -401,17 +401,16 @@ define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
401401
; CHECK: # %bb.0:
402402
; CHECK-NEXT: lui a0, %hi(.LCPI30_0)
403403
; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0)
404-
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
405-
; CHECK-NEXT: vle16.v v20, (a0)
406-
; CHECK-NEXT: lui a0, %hi(.LCPI30_1)
407-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_1)
408-
; CHECK-NEXT: vle16.v v22, (a0)
409-
; CHECK-NEXT: lui a0, 15
410-
; CHECK-NEXT: addi a0, a0, 240
404+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
405+
; CHECK-NEXT: vle8.v v16, (a0)
406+
; CHECK-NEXT: lui a0, 5
407+
; CHECK-NEXT: addi a0, a0, 1365
411408
; CHECK-NEXT: vmv.s.x v0, a0
412-
; CHECK-NEXT: vrgatherei16.vv v16, v8, v20
413-
; CHECK-NEXT: vrgatherei16.vv v16, v12, v22, v0.t
414-
; CHECK-NEXT: vmv.v.v v8, v16
409+
; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
410+
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
411+
; CHECK-NEXT: vsext.vf2 v18, v16
412+
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
413+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
415414
; CHECK-NEXT: ret
416415
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
417416
ret <16 x float> %out

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

+23-31
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ define <4 x i16> @shuffle_v4i16(<4 x i16> %x, <4 x i16> %y) {
1616
define <8 x i32> @shuffle_v8i32(<8 x i32> %x, <8 x i32> %y) {
1717
; CHECK-LABEL: shuffle_v8i32:
1818
; CHECK: # %bb.0:
19-
; CHECK-NEXT: li a0, 203
19+
; CHECK-NEXT: li a0, 52
2020
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
2121
; CHECK-NEXT: vmv.s.x v0, a0
22-
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
22+
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
2323
; CHECK-NEXT: ret
2424
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
2525
ret <8 x i32> %s
@@ -451,21 +451,14 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
451451
define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
452452
; CHECK-LABEL: splat_ve2_we0_ins_i2ve4_i5we6:
453453
; CHECK: # %bb.0:
454+
; CHECK-NEXT: lui a0, %hi(.LCPI26_0)
455+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0)
454456
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
455-
; CHECK-NEXT: vmv.v.i v10, 6
456-
; CHECK-NEXT: vmv.v.i v11, 0
457-
; CHECK-NEXT: lui a0, 8256
458-
; CHECK-NEXT: addi a0, a0, 2
459-
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
460-
; CHECK-NEXT: vmv.v.x v12, a0
461-
; CHECK-NEXT: li a0, 98
462-
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
463-
; CHECK-NEXT: vslideup.vi v11, v10, 5
457+
; CHECK-NEXT: vle8.v v10, (a0)
458+
; CHECK-NEXT: li a0, 65
464459
; CHECK-NEXT: vmv.s.x v0, a0
465-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
466-
; CHECK-NEXT: vrgather.vv v10, v8, v12
467-
; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t
468-
; CHECK-NEXT: vmv1r.v v8, v10
460+
; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0
461+
; CHECK-NEXT: vrgather.vv v8, v9, v10
469462
; CHECK-NEXT: ret
470463
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 2, i32 2, i32 14, i32 8, i32 2>
471464
ret <8 x i8> %shuff
@@ -693,12 +686,12 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
693686
; CHECK: # %bb.0:
694687
; CHECK-NEXT: lui a0, %hi(.LCPI46_0)
695688
; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0)
696-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
689+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
697690
; CHECK-NEXT: vle8.v v10, (a0)
698-
; CHECK-NEXT: li a0, -22
691+
; CHECK-NEXT: li a0, 171
699692
; CHECK-NEXT: vmv.s.x v0, a0
700-
; CHECK-NEXT: vslidedown.vi v8, v8, 2
701-
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
693+
; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0
694+
; CHECK-NEXT: vrgather.vv v8, v9, v10
702695
; CHECK-NEXT: ret
703696
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
704697
ret <8 x i8> %res
@@ -709,9 +702,9 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
709702
; CHECK-LABEL: shuffle_v8i32_2:
710703
; CHECK: # %bb.0:
711704
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
712-
; CHECK-NEXT: vmv.v.i v0, -13
705+
; CHECK-NEXT: vmv.v.i v0, 12
713706
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
714-
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
707+
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
715708
; CHECK-NEXT: ret
716709
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
717710
ret <8 x i32> %s
@@ -1027,17 +1020,16 @@ define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
10271020
; CHECK: # %bb.0:
10281021
; CHECK-NEXT: lui a0, %hi(.LCPI70_0)
10291022
; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_0)
1030-
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
1031-
; CHECK-NEXT: vle16.v v20, (a0)
1032-
; CHECK-NEXT: lui a0, %hi(.LCPI70_1)
1033-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_1)
1034-
; CHECK-NEXT: vle16.v v22, (a0)
1035-
; CHECK-NEXT: lui a0, 15
1036-
; CHECK-NEXT: addi a0, a0, 240
1023+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1024+
; CHECK-NEXT: vle8.v v16, (a0)
1025+
; CHECK-NEXT: lui a0, 5
1026+
; CHECK-NEXT: addi a0, a0, 1365
10371027
; CHECK-NEXT: vmv.s.x v0, a0
1038-
; CHECK-NEXT: vrgatherei16.vv v16, v8, v20
1039-
; CHECK-NEXT: vrgatherei16.vv v16, v12, v22, v0.t
1040-
; CHECK-NEXT: vmv.v.v v8, v16
1028+
; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
1029+
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
1030+
; CHECK-NEXT: vsext.vf2 v18, v16
1031+
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1032+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
10411033
; CHECK-NEXT: ret
10421034
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
10431035
ret <16 x i32> %out

0 commit comments

Comments
 (0)