Skip to content

Commit bcad0e9

Browse files
committed
[AArch64] Add custom lowering for load <3 x i8>.
Add custom combine to lower load <3 x i8> as the more efficient sequence below: ldrb wX, [x0, #2]; ldrh wY, [x0]; orr wX, wY, wX, lsl #16; fmov s0, wX. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790
1 parent d7258d8 commit bcad0e9

File tree

2 files changed

+76
-53
lines changed

2 files changed

+76
-53
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21248,6 +21248,51 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
2124821248
return SDValue();
2124921249
}
2125021250

21251+
// A custom combine to lower load <3 x i8> as the more efficient sequence
21252+
// below:
21253+
// ldrb wX, [x0, #2]
21254+
// ldrh wY, [x0]
21255+
// orr wX, wY, wX, lsl #16
21256+
// fmov s0, wX
21257+
//
21258+
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21259+
EVT MemVT = LD->getMemoryVT();
21260+
if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21261+
LD->getOriginalAlign() >= 4)
21262+
return SDValue();
21263+
21264+
SDLoc DL(LD);
21265+
SDValue Chain = LD->getChain();
21266+
SDValue BasePtr = LD->getBasePtr();
21267+
assert(LD->getOffset().isUndef() && "undef offset expected");
21268+
21269+
// Load 2 x i8, then 1 x i8.
21270+
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(),
21271+
LD->getOriginalAlign());
21272+
TypeSize Offset2 = TypeSize::getFixed(2);
21273+
SDValue L8 = DAG.getLoad(
21274+
MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21275+
LD->getPointerInfo(), commonAlignment(LD->getOriginalAlign(), Offset2));
21276+
21277+
// Extend to i32.
21278+
SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21279+
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21280+
21281+
// Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21282+
SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21283+
DAG.getConstant(16, DL, MVT::i32));
21284+
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr);
21285+
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21286+
21287+
// Extract v3i8 again.
21288+
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21289+
DAG.getConstant(0, DL, MVT::i64));
21290+
SDValue TokenFactor = DAG.getNode(
21291+
ISD::TokenFactor, DL, MVT::Other,
21292+
{SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21293+
return DAG.getMergeValues({Extract, TokenFactor}, DL);
21294+
}
21295+
2125121296
// Perform TBI simplification if supported by the target and try to break up
2125221297
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
2125321298
// load instructions can be selected.
@@ -21259,10 +21304,16 @@ static SDValue performLOADCombine(SDNode *N,
2125921304
performTBISimplification(N->getOperand(1), DCI, DAG);
2126021305

2126121306
LoadSDNode *LD = cast<LoadSDNode>(N);
21262-
EVT MemVT = LD->getMemoryVT();
21263-
if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
21307+
if (LD->isVolatile() || !Subtarget->isLittleEndian())
2126421308
return SDValue(N, 0);
2126521309

21310+
if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21311+
return Res;
21312+
21313+
if (!LD->isNonTemporal())
21314+
return SDValue(N, 0);
21315+
21316+
EVT MemVT = LD->getMemoryVT();
2126621317
if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
2126721318
MemVT.getSizeInBits() % 256 == 0 ||
2126821319
256 % MemVT.getScalarSizeInBits() != 0)

llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll

Lines changed: 23 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,10 @@
55
define <16 x i8> @load_v3i8(ptr %src) {
66
; CHECK-LABEL: load_v3i8:
77
; CHECK: ; %bb.0:
8-
; CHECK-NEXT: sub sp, sp, #16
9-
; CHECK-NEXT: .cfi_def_cfa_offset 16
10-
; CHECK-NEXT: ldrh w8, [x0]
11-
; CHECK-NEXT: strh w8, [sp, #12]
12-
; CHECK-NEXT: ldr s0, [sp, #12]
13-
; CHECK-NEXT: ushll.8h v0, v0, #0
14-
; CHECK-NEXT: umov.h w8, v0[0]
15-
; CHECK-NEXT: umov.h w9, v0[1]
8+
; CHECK-NEXT: ldrb w8, [x0, #2]
9+
; CHECK-NEXT: ldrh w9, [x0]
10+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
1611
; CHECK-NEXT: fmov s0, w8
17-
; CHECK-NEXT: add x8, x0, #2
18-
; CHECK-NEXT: mov.b v0[1], w9
19-
; CHECK-NEXT: ld1.b { v0 }[2], [x8]
20-
; CHECK-NEXT: add sp, sp, #16
2112
; CHECK-NEXT: ret
2213
;
2314
; BE-LABEL: load_v3i8:
@@ -47,19 +38,14 @@ define <16 x i8> @load_v3i8(ptr %src) {
4738
define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
4839
; CHECK-LABEL: load_v3i8_to_4xi32:
4940
; CHECK: ; %bb.0:
50-
; CHECK-NEXT: sub sp, sp, #16
51-
; CHECK-NEXT: .cfi_def_cfa_offset 16
52-
; CHECK-NEXT: ldrh w8, [x0]
41+
; CHECK-NEXT: ldrb w8, [x0, #2]
42+
; CHECK-NEXT: ldrh w9, [x0]
5343
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
54-
; CHECK-NEXT: strh w8, [sp, #12]
55-
; CHECK-NEXT: ldr s0, [sp, #12]
56-
; CHECK-NEXT: ldrsb w8, [x0, #2]
57-
; CHECK-NEXT: ushll.8h v0, v0, #0
58-
; CHECK-NEXT: mov.h v0[1], v0[1]
59-
; CHECK-NEXT: mov.h v0[2], w8
44+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
45+
; CHECK-NEXT: fmov s0, w8
46+
; CHECK-NEXT: zip1.8b v0, v0, v0
6047
; CHECK-NEXT: ushll.4s v0, v0, #0
6148
; CHECK-NEXT: and.16b v0, v0, v1
62-
; CHECK-NEXT: add sp, sp, #16
6349
; CHECK-NEXT: ret
6450
;
6551
; BE-LABEL: load_v3i8_to_4xi32:
@@ -160,19 +146,14 @@ define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) {
160146
define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
161147
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
162148
; CHECK: ; %bb.0:
163-
; CHECK-NEXT: sub sp, sp, #16
164-
; CHECK-NEXT: .cfi_def_cfa_offset 16
165-
; CHECK-NEXT: ldurh w8, [x0, #1]
149+
; CHECK-NEXT: ldrb w8, [x0, #3]
150+
; CHECK-NEXT: ldurh w9, [x0, #1]
166151
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
167-
; CHECK-NEXT: strh w8, [sp, #12]
168-
; CHECK-NEXT: ldr s0, [sp, #12]
169-
; CHECK-NEXT: ldrsb w8, [x0, #3]
170-
; CHECK-NEXT: ushll.8h v0, v0, #0
171-
; CHECK-NEXT: mov.h v0[1], v0[1]
172-
; CHECK-NEXT: mov.h v0[2], w8
152+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
153+
; CHECK-NEXT: fmov s0, w8
154+
; CHECK-NEXT: zip1.8b v0, v0, v0
173155
; CHECK-NEXT: ushll.4s v0, v0, #0
174156
; CHECK-NEXT: and.16b v0, v0, v1
175-
; CHECK-NEXT: add sp, sp, #16
176157
; CHECK-NEXT: ret
177158
;
178159
; BE-LABEL: load_v3i8_to_4xi32_const_offset_1:
@@ -204,19 +185,14 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
204185
define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
205186
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3:
206187
; CHECK: ; %bb.0:
207-
; CHECK-NEXT: sub sp, sp, #16
208-
; CHECK-NEXT: .cfi_def_cfa_offset 16
209-
; CHECK-NEXT: ldurh w8, [x0, #3]
188+
; CHECK-NEXT: ldrb w8, [x0, #5]
189+
; CHECK-NEXT: ldurh w9, [x0, #3]
210190
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
211-
; CHECK-NEXT: strh w8, [sp, #12]
212-
; CHECK-NEXT: ldr s0, [sp, #12]
213-
; CHECK-NEXT: ldrsb w8, [x0, #5]
214-
; CHECK-NEXT: ushll.8h v0, v0, #0
215-
; CHECK-NEXT: mov.h v0[1], v0[1]
216-
; CHECK-NEXT: mov.h v0[2], w8
191+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
192+
; CHECK-NEXT: fmov s0, w8
193+
; CHECK-NEXT: zip1.8b v0, v0, v0
217194
; CHECK-NEXT: ushll.4s v0, v0, #0
218195
; CHECK-NEXT: and.16b v0, v0, v1
219-
; CHECK-NEXT: add sp, sp, #16
220196
; CHECK-NEXT: ret
221197
;
222198
; BE-LABEL: load_v3i8_to_4xi32_const_offset_3:
@@ -439,19 +415,15 @@ entry:
439415
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
440416
; CHECK-LABEL: load_ext_to_64bits:
441417
; CHECK: ; %bb.0: ; %entry
442-
; CHECK-NEXT: sub sp, sp, #16
443-
; CHECK-NEXT: .cfi_def_cfa_offset 16
444-
; CHECK-NEXT: ldrh w8, [x0]
445-
; CHECK-NEXT: strh w8, [sp, #12]
446-
; CHECK-NEXT: add x8, x0, #2
447-
; CHECK-NEXT: ldr s0, [sp, #12]
448-
; CHECK-NEXT: ushll.8h v0, v0, #0
449-
; CHECK-NEXT: ld1.b { v0 }[4], [x8]
418+
; CHECK-NEXT: ldrb w8, [x0, #2]
419+
; CHECK-NEXT: ldrh w9, [x0]
420+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
421+
; CHECK-NEXT: fmov s0, w8
450422
; CHECK-NEXT: add x8, x1, #4
423+
; CHECK-NEXT: zip1.8b v0, v0, v0
451424
; CHECK-NEXT: bic.4h v0, #255, lsl #8
452425
; CHECK-NEXT: st1.h { v0 }[2], [x8]
453426
; CHECK-NEXT: str s0, [x1]
454-
; CHECK-NEXT: add sp, sp, #16
455427
; CHECK-NEXT: ret
456428
;
457429
; BE-LABEL: load_ext_to_64bits:

0 commit comments

Comments
 (0)