Skip to content

Commit efd07e9

Browse files
committed
[AArch64] Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
Improve codegen for a store of (trunc X to <3 x i8>) by converting it to a sequence of three ST1.b instructions: the truncate operand is first widened and bitcast to either v8i8 or v16i8, then the byte lanes that hold the truncated results are extracted and stored individually. At the moment, there are almost no cases in which such vector operations are generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790
1 parent 8336515 commit efd07e9

File tree

2 files changed

+62
-21
lines changed

2 files changed

+62
-21
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21318,6 +21318,53 @@ bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
2131821318
(SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
2131921319
}
2132021320

21321+
// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
21322+
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
21323+
const AArch64Subtarget *Subtarget) {
21324+
SDValue Value = ST->getValue();
21325+
EVT ValueVT = Value.getValueType();
21326+
21327+
if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
21328+
ST->getOriginalAlign() >= 4 || Value.getOpcode() != ISD::TRUNCATE ||
21329+
ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
21330+
return SDValue();
21331+
21332+
SDLoc DL(ST);
21333+
auto WideVT = EVT::getVectorVT(
21334+
*DAG.getContext(),
21335+
Value->getOperand(0).getValueType().getVectorElementType(), 4);
21336+
SDValue UndefVector = DAG.getUNDEF(WideVT);
21337+
SDValue WideTrunc = DAG.getNode(
21338+
ISD::INSERT_SUBVECTOR, DL, WideVT,
21339+
{UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
21340+
SDValue Cast = DAG.getNode(
21341+
ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
21342+
WideTrunc);
21343+
21344+
SDValue Chain = ST->getChain();
21345+
SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
21346+
DAG.getConstant(8, DL, MVT::i64));
21347+
21348+
SDValue Ptr2 =
21349+
DAG.getMemBasePlusOffset(ST->getBasePtr(), TypeSize::getFixed(2), DL);
21350+
Chain = DAG.getStore(Chain, DL, E2, Ptr2, ST->getPointerInfo(),
21351+
ST->getOriginalAlign());
21352+
21353+
SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
21354+
DAG.getConstant(4, DL, MVT::i64));
21355+
21356+
SDValue Ptr1 =
21357+
DAG.getMemBasePlusOffset(ST->getBasePtr(), TypeSize::getFixed(1), DL);
21358+
Chain = DAG.getStore(Chain, DL, E1, Ptr1, ST->getPointerInfo(),
21359+
ST->getOriginalAlign());
21360+
SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
21361+
DAG.getConstant(0, DL, MVT::i64));
21362+
Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), ST->getPointerInfo(),
21363+
ST->getOriginalAlign());
21364+
21365+
return Chain;
21366+
}
21367+
2132121368
static SDValue performSTORECombine(SDNode *N,
2132221369
TargetLowering::DAGCombinerInfo &DCI,
2132321370
SelectionDAG &DAG,
@@ -21333,6 +21380,9 @@ static SDValue performSTORECombine(SDNode *N,
2133321380
return EltVT == MVT::f32 || EltVT == MVT::f64;
2133421381
};
2133521382

21383+
if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
21384+
return Res;
21385+
2133621386
// If this is an FP_ROUND followed by a store, fold this into a truncating
2133721387
// store. We can do this even if this is already a truncstore.
2133821388
// We purposefully don't care about legality of the nodes here as we know

llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -154,17 +154,12 @@ define <3 x i32> @load_v3i32(ptr %src) {
154154
define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
155155
; CHECK-LABEL: store_trunc_from_64bits:
156156
; CHECK: ; %bb.0: ; %entry
157-
; CHECK-NEXT: sub sp, sp, #16
158-
; CHECK-NEXT: .cfi_def_cfa_offset 16
159-
; CHECK-NEXT: ldr s0, [x0]
160-
; CHECK-NEXT: ldrh w8, [x0, #4]
161-
; CHECK-NEXT: mov.h v0[2], w8
162-
; CHECK-NEXT: xtn.8b v0, v0
163-
; CHECK-NEXT: str s0, [sp, #12]
164-
; CHECK-NEXT: ldrh w9, [sp, #12]
165-
; CHECK-NEXT: strb w8, [x1, #2]
166-
; CHECK-NEXT: strh w9, [x1]
167-
; CHECK-NEXT: add sp, sp, #16
157+
; CHECK-NEXT: add x8, x0, #4
158+
; CHECK-NEXT: ld1r.4h { v0 }, [x8]
159+
; CHECK-NEXT: ldr w8, [x0]
160+
; CHECK-NEXT: strb w8, [x1]
161+
; CHECK-NEXT: add x8, x1, #1
162+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
168163
; CHECK-NEXT: ret
169164
;
170165
; BE-LABEL: store_trunc_from_64bits:
@@ -236,17 +231,13 @@ entry:
236231
define void @shift_trunc_store(ptr %src, ptr %dst) {
237232
; CHECK-LABEL: shift_trunc_store:
238233
; CHECK: ; %bb.0:
239-
; CHECK-NEXT: sub sp, sp, #16
240-
; CHECK-NEXT: .cfi_def_cfa_offset 16
241234
; CHECK-NEXT: ldr q0, [x0]
242-
; CHECK-NEXT: shrn.4h v0, v0, #16
243-
; CHECK-NEXT: xtn.8b v1, v0
244-
; CHECK-NEXT: umov.h w8, v0[2]
245-
; CHECK-NEXT: str s1, [sp, #12]
246-
; CHECK-NEXT: ldrh w9, [sp, #12]
247-
; CHECK-NEXT: strb w8, [x1, #2]
248-
; CHECK-NEXT: strh w9, [x1]
249-
; CHECK-NEXT: add sp, sp, #16
235+
; CHECK-NEXT: add x8, x1, #1
236+
; CHECK-NEXT: add x9, x1, #2
237+
; CHECK-NEXT: ushr.4s v0, v0, #16
238+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
239+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
240+
; CHECK-NEXT: st1.b { v0 }[0], [x1]
250241
; CHECK-NEXT: ret
251242
;
252243
; BE-LABEL: shift_trunc_store:

0 commit comments

Comments
 (0)