Skip to content

Commit 25f139e

Browse files
committed
[AArch64] Combine store (trunc X to <3 x i8>) to sequence of ST1.b. (llvm#78637)
Improve codegen for store (trunc X to <3 x i8>) by converting it to a sequence of 3 ST1.b instructions: first convert the truncate operand to either v8i8 or v16i8, then extract the lanes holding the truncate results and store them individually. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790. PR: llvm#78637. (Cherry-picked from eb678d8.)
1 parent 3d6f586 commit 25f139e

File tree

2 files changed

+96
-70
lines changed

2 files changed

+96
-70
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20937,6 +20937,53 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
2093720937
Store->getMemOperand());
2093820938
}
2093920939

20940+
// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
//
// Returns an empty SDValue (no combine) when the store is volatile, the
// subtarget is not little-endian, the stored value is not an ISD::TRUNCATE,
// or the stored type is not a 3-element i8 vector. Otherwise builds three
// single-byte stores (elements 2, 1, 0, in that order) linked through the
// chain and returns the chain of the last one.
20941+
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
20942+
const AArch64Subtarget *Subtarget) {
20943+
SDValue Value = ST->getValue();
20944+
EVT ValueVT = Value.getValueType();
20945+
20946+
// Guard: only non-volatile, little-endian stores of a truncate that
// produces exactly <3 x i8> are rewritten.
if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
20947+
Value.getOpcode() != ISD::TRUNCATE ||
20948+
ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
20949+
return SDValue();
20950+
20951+
assert(ST->getOffset().isUndef() && "undef offset expected");
20952+
SDLoc DL(ST);
20953+
// WideVT is a 4-element vector with the element type of the truncate's
// source operand.
auto WideVT = EVT::getVectorVT(
20954+
*DAG.getContext(),
20955+
Value->getOperand(0).getValueType().getVectorElementType(), 4);
20956+
SDValue UndefVector = DAG.getUNDEF(WideVT);
20957+
// Place the truncate's source operand at index 0 of an undef WideVT vector.
SDValue WideTrunc = DAG.getNode(
20958+
ISD::INSERT_SUBVECTOR, DL, WideVT,
20959+
{UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
20960+
// Reinterpret the widened vector as bytes: v8i8 when WideVT is 64 bits wide,
// v16i8 otherwise.
// NOTE(review): presumably WideVT is always 64 or 128 bits when this runs;
// a wider source would not match v16i8 in size -- confirm.
SDValue Cast = DAG.getNode(
20961+
ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
20962+
WideTrunc);
20963+
20964+
MachineFunction &MF = DAG.getMachineFunction();
20965+
SDValue Chain = ST->getChain();
20966+
MachineMemOperand *MMO = ST->getMemOperand();
20967+
// Bytes per source element. Result element k is read from byte lane
// k * IdxScale of Cast.
// NOTE(review): presumably that lane is the low byte of source element k,
// which is why big-endian is rejected above -- confirm.
unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
20968+
// Element 2: extract its byte and store it at base + 2.
SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
20969+
DAG.getConstant(2 * IdxScale, DL, MVT::i64));
20970+
TypeSize Offset2 = TypeSize::getFixed(2);
20971+
SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
20972+
Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
20973+
20974+
// Element 1: extract its byte and store it at base + 1.
SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
20975+
DAG.getConstant(1 * IdxScale, DL, MVT::i64));
20976+
TypeSize Offset1 = TypeSize::getFixed(1);
20977+
SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
20978+
Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
20979+
20980+
// Element 0: byte lane 0, stored at the original base pointer.
SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
20981+
DAG.getConstant(0, DL, MVT::i64));
20982+
Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
20983+
MF.getMachineMemOperand(MMO, 0, 1));
20984+
// The chain of the last store stands in for the original store.
return Chain;
20985+
}
20986+
2094020987
static SDValue performSTORECombine(SDNode *N,
2094120988
TargetLowering::DAGCombinerInfo &DCI,
2094220989
SelectionDAG &DAG,
@@ -20952,6 +20999,9 @@ static SDValue performSTORECombine(SDNode *N,
2095220999
return EltVT == MVT::f32 || EltVT == MVT::f64;
2095321000
};
2095421001

21002+
if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
21003+
return Res;
21004+
2095521005
// If this is an FP_ROUND followed by a store, fold this into a truncating
2095621006
// store. We can do this even if this is already a truncstore.
2095721007
// We purposefully don't care about legality of the nodes here as we know

llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll

Lines changed: 46 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -349,17 +349,15 @@ define <3 x i32> @load_v3i32(ptr %src) {
349349
define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
350350
; CHECK-LABEL: store_trunc_from_64bits:
351351
; CHECK: ; %bb.0: ; %entry
352-
; CHECK-NEXT: sub sp, sp, #16
353-
; CHECK-NEXT: .cfi_def_cfa_offset 16
354-
; CHECK-NEXT: ldr s0, [x0]
355-
; CHECK-NEXT: ldrh w8, [x0, #4]
356-
; CHECK-NEXT: mov.h v0[2], w8
357-
; CHECK-NEXT: xtn.8b v0, v0
358-
; CHECK-NEXT: str s0, [sp, #12]
359-
; CHECK-NEXT: ldrh w9, [sp, #12]
360-
; CHECK-NEXT: strb w8, [x1, #2]
361-
; CHECK-NEXT: strh w9, [x1]
362-
; CHECK-NEXT: add sp, sp, #16
352+
; CHECK-NEXT: add x8, x0, #4
353+
; CHECK-NEXT: ldr w9, [x0]
354+
; CHECK-NEXT: add x10, x1, #1
355+
; CHECK-NEXT: ld1r.4h { v0 }, [x8]
356+
; CHECK-NEXT: fmov s1, w9
357+
; CHECK-NEXT: add x8, x1, #2
358+
; CHECK-NEXT: strb w9, [x1]
359+
; CHECK-NEXT: st1.b { v1 }[2], [x10]
360+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
363361
; CHECK-NEXT: ret
364362
;
365363
; BE-LABEL: store_trunc_from_64bits:
@@ -388,23 +386,19 @@ entry:
388386
define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
389387
; CHECK-LABEL: store_trunc_add_from_64bits:
390388
; CHECK: ; %bb.0: ; %entry
391-
; CHECK-NEXT: sub sp, sp, #16
392-
; CHECK-NEXT: .cfi_def_cfa_offset 16
393389
; CHECK-NEXT: add x8, x0, #4
394390
; CHECK-NEXT: ldr s0, [x0]
395391
; CHECK-NEXT: Lloh0:
396392
; CHECK-NEXT: adrp x9, lCPI9_0@PAGE
397393
; CHECK-NEXT: ld1.h { v0 }[2], [x8]
394+
; CHECK-NEXT: add x8, x1, #1
398395
; CHECK-NEXT: Lloh1:
399396
; CHECK-NEXT: ldr d1, [x9, lCPI9_0@PAGEOFF]
397+
; CHECK-NEXT: add x9, x1, #2
400398
; CHECK-NEXT: add.4h v0, v0, v1
401-
; CHECK-NEXT: xtn.8b v1, v0
402-
; CHECK-NEXT: umov.h w8, v0[2]
403-
; CHECK-NEXT: str s1, [sp, #12]
404-
; CHECK-NEXT: ldrh w9, [sp, #12]
405-
; CHECK-NEXT: strb w8, [x1, #2]
406-
; CHECK-NEXT: strh w9, [x1]
407-
; CHECK-NEXT: add sp, sp, #16
399+
; CHECK-NEXT: st1.b { v0 }[2], [x8]
400+
; CHECK-NEXT: st1.b { v0 }[4], [x9]
401+
; CHECK-NEXT: st1.b { v0 }[0], [x1]
408402
; CHECK-NEXT: ret
409403
; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
410404
;
@@ -597,17 +591,13 @@ entry:
597591
define void @shift_trunc_store(ptr %src, ptr %dst) {
598592
; CHECK-LABEL: shift_trunc_store:
599593
; CHECK: ; %bb.0:
600-
; CHECK-NEXT: sub sp, sp, #16
601-
; CHECK-NEXT: .cfi_def_cfa_offset 16
602594
; CHECK-NEXT: ldr q0, [x0]
603-
; CHECK-NEXT: shrn.4h v0, v0, #16
604-
; CHECK-NEXT: xtn.8b v1, v0
605-
; CHECK-NEXT: umov.h w8, v0[2]
606-
; CHECK-NEXT: str s1, [sp, #12]
607-
; CHECK-NEXT: ldrh w9, [sp, #12]
608-
; CHECK-NEXT: strb w8, [x1, #2]
609-
; CHECK-NEXT: strh w9, [x1]
610-
; CHECK-NEXT: add sp, sp, #16
595+
; CHECK-NEXT: add x8, x1, #1
596+
; CHECK-NEXT: add x9, x1, #2
597+
; CHECK-NEXT: ushr.4s v0, v0, #16
598+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
599+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
600+
; CHECK-NEXT: st1.b { v0 }[0], [x1]
611601
; CHECK-NEXT: ret
612602
;
613603
; BE-LABEL: shift_trunc_store:
@@ -635,17 +625,13 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
635625
define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
636626
; CHECK-LABEL: shift_trunc_store_default_align:
637627
; CHECK: ; %bb.0:
638-
; CHECK-NEXT: sub sp, sp, #16
639-
; CHECK-NEXT: .cfi_def_cfa_offset 16
640628
; CHECK-NEXT: ldr q0, [x0]
641-
; CHECK-NEXT: shrn.4h v0, v0, #16
642-
; CHECK-NEXT: xtn.8b v1, v0
643-
; CHECK-NEXT: umov.h w8, v0[2]
644-
; CHECK-NEXT: str s1, [sp, #12]
645-
; CHECK-NEXT: ldrh w9, [sp, #12]
646-
; CHECK-NEXT: strb w8, [x1, #2]
647-
; CHECK-NEXT: strh w9, [x1]
648-
; CHECK-NEXT: add sp, sp, #16
629+
; CHECK-NEXT: add x8, x1, #1
630+
; CHECK-NEXT: add x9, x1, #2
631+
; CHECK-NEXT: ushr.4s v0, v0, #16
632+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
633+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
634+
; CHECK-NEXT: st1.b { v0 }[0], [x1]
649635
; CHECK-NEXT: ret
650636
;
651637
; BE-LABEL: shift_trunc_store_default_align:
@@ -673,17 +659,13 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
673659
define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
674660
; CHECK-LABEL: shift_trunc_store_align_4:
675661
; CHECK: ; %bb.0:
676-
; CHECK-NEXT: sub sp, sp, #16
677-
; CHECK-NEXT: .cfi_def_cfa_offset 16
678662
; CHECK-NEXT: ldr q0, [x0]
679-
; CHECK-NEXT: shrn.4h v0, v0, #16
680-
; CHECK-NEXT: xtn.8b v1, v0
681-
; CHECK-NEXT: umov.h w8, v0[2]
682-
; CHECK-NEXT: str s1, [sp, #12]
683-
; CHECK-NEXT: ldrh w9, [sp, #12]
684-
; CHECK-NEXT: strb w8, [x1, #2]
685-
; CHECK-NEXT: strh w9, [x1]
686-
; CHECK-NEXT: add sp, sp, #16
663+
; CHECK-NEXT: add x8, x1, #1
664+
; CHECK-NEXT: add x9, x1, #2
665+
; CHECK-NEXT: ushr.4s v0, v0, #16
666+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
667+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
668+
; CHECK-NEXT: st1.b { v0 }[0], [x1]
687669
; CHECK-NEXT: ret
688670
;
689671
; BE-LABEL: shift_trunc_store_align_4:
@@ -711,17 +693,14 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
711693
define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
712694
; CHECK-LABEL: shift_trunc_store_const_offset_1:
713695
; CHECK: ; %bb.0:
714-
; CHECK-NEXT: sub sp, sp, #16
715-
; CHECK-NEXT: .cfi_def_cfa_offset 16
716696
; CHECK-NEXT: ldr q0, [x0]
717-
; CHECK-NEXT: shrn.4h v0, v0, #16
718-
; CHECK-NEXT: xtn.8b v1, v0
719-
; CHECK-NEXT: umov.h w8, v0[2]
720-
; CHECK-NEXT: str s1, [sp, #12]
721-
; CHECK-NEXT: ldrh w9, [sp, #12]
722-
; CHECK-NEXT: strb w8, [x1, #3]
723-
; CHECK-NEXT: sturh w9, [x1, #1]
724-
; CHECK-NEXT: add sp, sp, #16
697+
; CHECK-NEXT: add x8, x1, #2
698+
; CHECK-NEXT: add x9, x1, #3
699+
; CHECK-NEXT: add x10, x1, #1
700+
; CHECK-NEXT: ushr.4s v0, v0, #16
701+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
702+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
703+
; CHECK-NEXT: st1.b { v0 }[0], [x10]
725704
; CHECK-NEXT: ret
726705
;
727706
; BE-LABEL: shift_trunc_store_const_offset_1:
@@ -750,17 +729,14 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
750729
define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
751730
; CHECK-LABEL: shift_trunc_store_const_offset_3:
752731
; CHECK: ; %bb.0:
753-
; CHECK-NEXT: sub sp, sp, #16
754-
; CHECK-NEXT: .cfi_def_cfa_offset 16
755732
; CHECK-NEXT: ldr q0, [x0]
756-
; CHECK-NEXT: shrn.4h v0, v0, #16
757-
; CHECK-NEXT: xtn.8b v1, v0
758-
; CHECK-NEXT: umov.h w8, v0[2]
759-
; CHECK-NEXT: str s1, [sp, #12]
760-
; CHECK-NEXT: ldrh w9, [sp, #12]
761-
; CHECK-NEXT: strb w8, [x1, #5]
762-
; CHECK-NEXT: sturh w9, [x1, #3]
763-
; CHECK-NEXT: add sp, sp, #16
733+
; CHECK-NEXT: add x8, x1, #4
734+
; CHECK-NEXT: add x9, x1, #5
735+
; CHECK-NEXT: add x10, x1, #3
736+
; CHECK-NEXT: ushr.4s v0, v0, #16
737+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
738+
; CHECK-NEXT: st1.b { v0 }[8], [x9]
739+
; CHECK-NEXT: st1.b { v0 }[0], [x10]
764740
; CHECK-NEXT: ret
765741
;
766742
; BE-LABEL: shift_trunc_store_const_offset_3:

0 commit comments

Comments
 (0)