Skip to content

Commit 109038b

Browse files
committed
Try using LD1r.
1 parent 7e2bf68 commit 109038b

File tree

2 files changed

+75
-51
lines changed

2 files changed

+75
-51
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 57 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11012,6 +11012,48 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
1101211012
MaskSourceVec);
1101311013
}
1101411014

11015+
// Check if Op is a BUILD_VECTOR with 2 extracts and a load that is cheaper to
11016+
// insert into a vector and use a shuffle. This improves lowering for loads of
11017+
// <3 x i8>.
11018+
static SDValue shuffleWithSingleLoad(SDValue Op, SelectionDAG &DAG) {
11019+
if (Op.getNumOperands() != 4 || Op.getValueType() != MVT::v4i16)
11020+
return SDValue();
11021+
11022+
SDValue V0 = Op.getOperand(0);
11023+
SDValue V1 = Op.getOperand(1);
11024+
SDValue V2 = Op.getOperand(2);
11025+
SDValue V3 = Op.getOperand(3);
11026+
if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11027+
V1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11028+
V2.getOpcode() != ISD::LOAD || !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT))
11029+
return SDValue();
11030+
11031+
if (V0.getOperand(0) != V1.getOperand(0) ||
11032+
V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 || !(V3.isUndef() || V3.getConstantOperandVal(1) == 3))
11033+
return SDValue();
11034+
11035+
SDLoc dl(Op);
11036+
auto *L = cast<LoadSDNode>(Op.getOperand(2));
11037+
auto Vec = V0.getOperand(0);
11038+
11039+
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Vec.getValueType(), Vec,
11040+
SDValue(L, 0), DAG.getConstant(2, dl, MVT::i64));
11041+
Vec = DAG.getNode(ISD::BITCAST, dl, MVT::v4i16, Vec);
11042+
11043+
SDValue ShuffleOps[] = {DAG.getUNDEF(MVT::v4i16), DAG.getUNDEF(MVT::v4i16)};
11044+
ShuffleOps[0] = Vec;
11045+
11046+
SmallVector<int, 8> Mask(4, -1);
11047+
Mask[0] = 0;
11048+
Mask[1] = 1;
11049+
Mask[2] = 2;
11050+
if (!V3.isUndef())
11051+
Mask[3] = 3;
11052+
SDValue Shuffle =
11053+
DAG.getVectorShuffle(MVT::v4i16, dl, ShuffleOps[0], ShuffleOps[1], Mask);
11054+
return Shuffle;
11055+
}
11056+
1101511057
// Gather data to see if the operation can be modelled as a
1101611058
// shuffle in combination with VEXTs.
1101711059
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
@@ -11022,6 +11064,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
1102211064
EVT VT = Op.getValueType();
1102311065
assert(!VT.isScalableVector() &&
1102411066
"Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11067+
11068+
if (SDValue S = shuffleWithSingleLoad(Op, DAG))
11069+
return S;
11070+
1102511071
unsigned NumElts = VT.getVectorNumElements();
1102611072

1102711073
struct ShuffleSourceInfo {
@@ -11048,6 +11094,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
1104811094

1104911095
// First gather all vectors used as an immediate source for this BUILD_VECTOR
1105011096
// node.
11097+
//
1105111098
SmallVector<ShuffleSourceInfo, 2> Sources;
1105211099
for (unsigned i = 0; i < NumElts; ++i) {
1105311100
SDValue V = Op.getOperand(i);
@@ -21269,24 +21316,23 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
2126921316
assert(LD->getOffset().isUndef() && "undef offset expected");
2127021317

2127121318
// Load 2 x i8, then 1 x i8.
21272-
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21319+
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr,
21320+
MF.getMachineMemOperand(MMO, 0, 2));
2127321321
TypeSize Offset2 = TypeSize::getFixed(2);
2127421322
SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
2127521323
DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
2127621324
MF.getMachineMemOperand(MMO, 2, 1));
2127721325

21278-
// Extend to i32.
21279-
SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21280-
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21326+
SDValue Ins16 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::v4i16, L16);
2128121327

21282-
// Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21283-
SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21284-
DAG.getConstant(16, DL, MVT::i32));
21285-
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr);
21286-
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21328+
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Ins16);
21329+
21330+
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21331+
SDValue Trunc8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Ext8);
2128721332

21288-
// Extract v3i8 again.
21289-
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21333+
SDValue Ins8 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i8, Cast,
21334+
Trunc8, DAG.getConstant(2, DL, MVT::i64));
21335+
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Ins8,
2129021336
DAG.getConstant(0, DL, MVT::i64));
2129121337
SDValue TokenFactor = DAG.getNode(
2129221338
ISD::TokenFactor, DL, MVT::Other,

llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll

Lines changed: 18 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@
55
define <16 x i8> @load_v3i8(ptr %src) {
66
; CHECK-LABEL: load_v3i8:
77
; CHECK: ; %bb.0:
8-
; CHECK-NEXT: ldrb w8, [x0, #2]
9-
; CHECK-NEXT: ldrh w9, [x0]
10-
; CHECK-NEXT: orr w8, w9, w8, lsl #16
11-
; CHECK-NEXT: fmov s0, w8
8+
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
9+
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
1210
; CHECK-NEXT: ret
1311
;
1412
; BE-LABEL: load_v3i8:
@@ -38,12 +36,9 @@ define <16 x i8> @load_v3i8(ptr %src) {
3836
define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
3937
; CHECK-LABEL: load_v3i8_to_4xi32:
4038
; CHECK: ; %bb.0:
41-
; CHECK-NEXT: ldrb w8, [x0, #2]
42-
; CHECK-NEXT: ldrh w9, [x0]
39+
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
4340
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
44-
; CHECK-NEXT: orr w8, w9, w8, lsl #16
45-
; CHECK-NEXT: fmov s0, w8
46-
; CHECK-NEXT: zip1.8b v0, v0, v0
41+
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
4742
; CHECK-NEXT: ushll.4s v0, v0, #0
4843
; CHECK-NEXT: and.16b v0, v0, v1
4944
; CHECK-NEXT: ret
@@ -59,7 +54,6 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
5954
; BE-NEXT: ldrsb w8, [x0, #2]
6055
; BE-NEXT: rev32 v0.8b, v0.8b
6156
; BE-NEXT: ushll v0.8h, v0.8b, #0
62-
; BE-NEXT: mov v0.h[1], v0.h[1]
6357
; BE-NEXT: mov v0.h[2], w8
6458
; BE-NEXT: ushll v0.4s, v0.4h, #0
6559
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -76,12 +70,9 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
7670
define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
7771
; CHECK-LABEL: load_v3i8_to_4xi32_align_2:
7872
; CHECK: ; %bb.0:
79-
; CHECK-NEXT: ldrb w8, [x0, #2]
80-
; CHECK-NEXT: ldrh w9, [x0]
73+
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
8174
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
82-
; CHECK-NEXT: orr w8, w9, w8, lsl #16
83-
; CHECK-NEXT: fmov s0, w8
84-
; CHECK-NEXT: zip1.8b v0, v0, v0
75+
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
8576
; CHECK-NEXT: ushll.4s v0, v0, #0
8677
; CHECK-NEXT: and.16b v0, v0, v1
8778
; CHECK-NEXT: ret
@@ -97,7 +88,6 @@ define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
9788
; BE-NEXT: ldrsb w8, [x0, #2]
9889
; BE-NEXT: rev32 v0.8b, v0.8b
9990
; BE-NEXT: ushll v0.8h, v0.8b, #0
100-
; BE-NEXT: mov v0.h[1], v0.h[1]
10191
; BE-NEXT: mov v0.h[2], w8
10292
; BE-NEXT: ushll v0.4s, v0.4h, #0
10393
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -141,12 +131,11 @@ define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) {
141131
define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
142132
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
143133
; CHECK: ; %bb.0:
144-
; CHECK-NEXT: ldrb w8, [x0, #3]
145-
; CHECK-NEXT: ldurh w9, [x0, #1]
134+
; CHECK-NEXT: add x8, x0, #1
146135
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
147-
; CHECK-NEXT: orr w8, w9, w8, lsl #16
148-
; CHECK-NEXT: fmov s0, w8
149-
; CHECK-NEXT: zip1.8b v0, v0, v0
136+
; CHECK-NEXT: ld1r.4h { v0 }, [x8]
137+
; CHECK-NEXT: add x8, x0, #3
138+
; CHECK-NEXT: ld1.b { v0 }[2], [x8]
150139
; CHECK-NEXT: ushll.4s v0, v0, #0
151140
; CHECK-NEXT: and.16b v0, v0, v1
152141
; CHECK-NEXT: ret
@@ -162,7 +151,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
162151
; BE-NEXT: ldrsb w8, [x0, #3]
163152
; BE-NEXT: rev32 v0.8b, v0.8b
164153
; BE-NEXT: ushll v0.8h, v0.8b, #0
165-
; BE-NEXT: mov v0.h[1], v0.h[1]
166154
; BE-NEXT: mov v0.h[2], w8
167155
; BE-NEXT: ushll v0.4s, v0.4h, #0
168156
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -180,12 +168,11 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
180168
define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
181169
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3:
182170
; CHECK: ; %bb.0:
183-
; CHECK-NEXT: ldrb w8, [x0, #5]
184-
; CHECK-NEXT: ldurh w9, [x0, #3]
171+
; CHECK-NEXT: add x8, x0, #3
185172
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
186-
; CHECK-NEXT: orr w8, w9, w8, lsl #16
187-
; CHECK-NEXT: fmov s0, w8
188-
; CHECK-NEXT: zip1.8b v0, v0, v0
173+
; CHECK-NEXT: ld1r.4h { v0 }, [x8]
174+
; CHECK-NEXT: add x8, x0, #5
175+
; CHECK-NEXT: ld1.b { v0 }[2], [x8]
189176
; CHECK-NEXT: ushll.4s v0, v0, #0
190177
; CHECK-NEXT: and.16b v0, v0, v1
191178
; CHECK-NEXT: ret
@@ -201,7 +188,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
201188
; BE-NEXT: ldrsb w8, [x0, #5]
202189
; BE-NEXT: rev32 v0.8b, v0.8b
203190
; BE-NEXT: ushll v0.8h, v0.8b, #0
204-
; BE-NEXT: mov v0.h[1], v0.h[1]
205191
; BE-NEXT: mov v0.h[2], w8
206192
; BE-NEXT: ushll v0.4s, v0.4h, #0
207193
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -263,7 +249,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) {
263249
; CHECK-NEXT: ldr s0, [sp, #12]
264250
; CHECK-NEXT: ldrsb w8, [x0, #2]
265251
; CHECK-NEXT: ushll.8h v0, v0, #0
266-
; CHECK-NEXT: mov.h v0[1], v0[1]
267252
; CHECK-NEXT: mov.h v0[2], w8
268253
; CHECK-NEXT: ushll.4s v0, v0, #0
269254
; CHECK-NEXT: and.16b v0, v0, v1
@@ -281,7 +266,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) {
281266
; BE-NEXT: ldrsb w8, [x0, #2]
282267
; BE-NEXT: rev32 v0.8b, v0.8b
283268
; BE-NEXT: ushll v0.8h, v0.8b, #0
284-
; BE-NEXT: mov v0.h[1], v0.h[1]
285269
; BE-NEXT: mov v0.h[2], w8
286270
; BE-NEXT: ushll v0.4s, v0.4h, #0
287271
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -410,12 +394,9 @@ entry:
410394
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
411395
; CHECK-LABEL: load_ext_to_64bits:
412396
; CHECK: ; %bb.0: ; %entry
413-
; CHECK-NEXT: ldrb w8, [x0, #2]
414-
; CHECK-NEXT: ldrh w9, [x0]
415-
; CHECK-NEXT: orr w8, w9, w8, lsl #16
416-
; CHECK-NEXT: fmov s0, w8
397+
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
417398
; CHECK-NEXT: add x8, x1, #4
418-
; CHECK-NEXT: zip1.8b v0, v0, v0
399+
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
419400
; CHECK-NEXT: bic.4h v0, #255, lsl #8
420401
; CHECK-NEXT: st1.h { v0 }[2], [x8]
421402
; CHECK-NEXT: str s0, [x1]
@@ -507,16 +488,13 @@ entry:
507488
define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
508489
; CHECK-LABEL: load_ext_add_to_64bits:
509490
; CHECK: ; %bb.0: ; %entry
510-
; CHECK-NEXT: ldrb w9, [x0, #2]
511-
; CHECK-NEXT: ldrh w10, [x0]
491+
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
512492
; CHECK-NEXT: Lloh2:
513493
; CHECK-NEXT: adrp x8, lCPI13_0@PAGE
514494
; CHECK-NEXT: Lloh3:
515495
; CHECK-NEXT: ldr d1, [x8, lCPI13_0@PAGEOFF]
516496
; CHECK-NEXT: add x8, x1, #4
517-
; CHECK-NEXT: orr w9, w10, w9, lsl #16
518-
; CHECK-NEXT: fmov s0, w9
519-
; CHECK-NEXT: zip1.8b v0, v0, v0
497+
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
520498
; CHECK-NEXT: bic.4h v0, #255, lsl #8
521499
; CHECK-NEXT: add.4h v0, v0, v1
522500
; CHECK-NEXT: st1.h { v0 }[2], [x8]

0 commit comments

Comments
 (0)