Skip to content

Commit a1c53b5

Browse files
committed
[AArch64][GlobalISel] Push ADD/SUB through {S|Z}EXT
i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8)). This reduces the number of shift instructions generated by selecting the {s|u}addl instruction as early as possible. During instruction selection the result will be selected as: i32 ext (i16 uaddl i8, i8) instead of: i32 uaddl (i16 ext i8, i16 ext i8).
1 parent 1cccd3f commit a1c53b5

File tree

6 files changed

+300
-251
lines changed

6 files changed

+300
-251
lines changed

llvm/lib/Target/AArch64/AArch64Combine.td

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,25 @@ def ext_uaddv_to_uaddlv : GICombineRule<
5252
(apply [{ applyExtUaddvToUaddlv(*${root}, MRI, B, Observer, ${matchinfo}); }])
5353
>;
5454

55+
// Push G_ADD and G_SUB through G_{Z|S}EXT so that addl/subl instructions can be selected earlier.
56+
// add(ext, ext) => ext(add(ext, ext)); the matchinfo tuple carries (is-sext flag, dst reg, first ext source reg, second ext source reg).
57+
def push_add_matchinfo :
58+
GIDefMatchData<"std::tuple<bool, Register, Register, Register>">;
59+
def push_add_sub_through_ext : GICombineRule<
60+
(defs root:$root, push_add_matchinfo:$matchinfo),
61+
(match (wip_match_opcode G_ADD, G_SUB):$root,
62+
[{ return matchPushAddSubExt(*${root}, MRI, ${matchinfo}); }]),
63+
(apply [{ applyPushAddSubExt(*${root}, MRI, B, ${matchinfo}); }])>;
64+
5565
def AArch64PreLegalizerCombiner: GICombiner<
5666
"AArch64PreLegalizerCombinerImpl", [all_combines,
5767
fconstant_to_constant,
5868
icmp_redundant_trunc,
5969
fold_global_offset,
6070
shuffle_to_extract,
6171
ext_addv_to_udot_addv,
62-
ext_uaddv_to_uaddlv]> {
72+
ext_uaddv_to_uaddlv,
73+
push_add_sub_through_ext]> {
6374
let CombineAllMethodName = "tryCombineAllImpl";
6475
}
6576

llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,74 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
554554
MI.eraseFromParent();
555555
}
556556

557+
// Pushes ADD/SUB through extend instructions to decrease the number of extend
558+
// instruction at the end by allowing selection of {s|u}addl sooner
559+
560+
// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
561+
bool matchPushAddSubExt(
562+
MachineInstr &MI, MachineRegisterInfo &MRI,
563+
std::tuple<bool, Register, Register, Register> &matchinfo) {
564+
assert(MI.getOpcode() == TargetOpcode::G_ADD ||
565+
MI.getOpcode() == TargetOpcode::G_SUB &&
566+
"Expected a G_ADD or G_SUB instruction\n");
567+
MachineInstr *ExtMI1 = MRI.getVRegDef(MI.getOperand(1).getReg());
568+
MachineInstr *ExtMI2 = MRI.getVRegDef(MI.getOperand(2).getReg());
569+
570+
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
571+
if (!DstTy.isVector())
572+
return false;
573+
574+
// Check the source came from G_{S/Z}EXT instructions
575+
if (ExtMI1->getOpcode() != ExtMI2->getOpcode() ||
576+
(ExtMI1->getOpcode() != TargetOpcode::G_SEXT &&
577+
ExtMI1->getOpcode() != TargetOpcode::G_ZEXT))
578+
return false;
579+
580+
if (!MRI.hasOneUse(ExtMI1->getOperand(0).getReg()) ||
581+
!MRI.hasOneUse(ExtMI2->getOperand(0).getReg()))
582+
return false;
583+
584+
// Return true if G_{S|Z}EXT instruction is more than 2* source
585+
Register ExtDstReg = MI.getOperand(1).getReg();
586+
get<0>(matchinfo) = ExtMI1->getOpcode() == TargetOpcode::G_SEXT;
587+
get<1>(matchinfo) = MI.getOperand(0).getReg();
588+
get<2>(matchinfo) = ExtMI1->getOperand(1).getReg();
589+
get<3>(matchinfo) = ExtMI2->getOperand(1).getReg();
590+
591+
LLT ExtDstTy = MRI.getType(ExtDstReg);
592+
LLT Ext1SrcTy = MRI.getType(get<2>(matchinfo));
593+
LLT Ext2SrcTy = MRI.getType(get<3>(matchinfo));
594+
595+
if (((Ext1SrcTy.getScalarSizeInBits() == 8 &&
596+
ExtDstTy.getScalarSizeInBits() == 32) ||
597+
((Ext1SrcTy.getScalarSizeInBits() == 8 ||
598+
Ext1SrcTy.getScalarSizeInBits() == 16) &&
599+
ExtDstTy.getScalarSizeInBits() == 64)) &&
600+
Ext1SrcTy == Ext2SrcTy)
601+
return true;
602+
603+
return false;
604+
}
605+
606+
// Rewrites a matched G_ADD/G_SUB of two extends into
//
//   ext(op(ext' a, ext' b))
//
// where ext' extends each source to an intermediate type whose elements are
// twice as wide as the source elements, op is MI's own opcode performed in that
// intermediate type, and the outer ext widens the result into MI's original
// destination register. MI is erased afterwards.
//
// matchinfo is the tuple filled in by matchPushAddSubExt:
//   <0> true for G_SEXT operands, false for G_ZEXT
//   <1> destination register of MI
//   <2>/<3> source registers of the first/second extend
void applyPushAddSubExt(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    std::tuple<bool, Register, Register, Register> &matchinfo) {
  const bool IsSigned = std::get<0>(matchinfo);
  const Register DstReg = std::get<1>(matchinfo);
  const Register LHSSrc = std::get<2>(matchinfo);
  const Register RHSSrc = std::get<3>(matchinfo);

  const unsigned ArithOpc = MI.getOpcode();
  const unsigned ExtOpc =
      IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;

  // Intermediate type: same shape as the source, elements twice as wide.
  const LLT SrcTy = MRI.getType(LHSSrc);
  const LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);

  const Register LHSExt = B.buildInstr(ExtOpc, {MidTy}, {LHSSrc}).getReg(0);
  const Register RHSExt = B.buildInstr(ExtOpc, {MidTy}, {RHSSrc}).getReg(0);
  const Register NarrowRes =
      B.buildInstr(ArithOpc, {MidTy}, {LHSExt, RHSExt}).getReg(0);

  // An add keeps the signedness of its operands. A sub is always
  // sign-extended, because the difference of two zero-extended values can be
  // negative.
  const unsigned OuterOpc =
      ArithOpc == TargetOpcode::G_ADD ? ExtOpc : TargetOpcode::G_SEXT;
  B.buildInstr(OuterOpc, {DstReg}, {NarrowRes});

  MI.eraseFromParent();
}
624+
557625
bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
558626
CombinerHelper &Helper, GISelChangeObserver &Observer) {
559627
// Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if

llvm/test/CodeGen/AArch64/aarch64-addv.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -97,17 +97,17 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias
9797
; GISEL-NEXT: ldr d1, [x0]
9898
; GISEL-NEXT: ldr d2, [x1]
9999
; GISEL-NEXT: movi v0.2d, #0000000000000000
100-
; GISEL-NEXT: ushll v1.8h, v1.8b, #0
101-
; GISEL-NEXT: ushll v2.8h, v2.8b, #0
102-
; GISEL-NEXT: usubl v3.4s, v1.4h, v2.4h
103-
; GISEL-NEXT: usubl2 v1.4s, v1.8h, v2.8h
104-
; GISEL-NEXT: cmgt v2.4s, v0.4s, v3.4s
105-
; GISEL-NEXT: cmgt v0.4s, v0.4s, v1.4s
106-
; GISEL-NEXT: neg v4.4s, v3.4s
107-
; GISEL-NEXT: neg v5.4s, v1.4s
108-
; GISEL-NEXT: bsl v2.16b, v4.16b, v3.16b
109-
; GISEL-NEXT: bsl v0.16b, v5.16b, v1.16b
110-
; GISEL-NEXT: add v0.4s, v2.4s, v0.4s
100+
; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b
101+
; GISEL-NEXT: sshll v2.4s, v1.4h, #0
102+
; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0
103+
; GISEL-NEXT: cmgt v4.4s, v0.4s, v2.4s
104+
; GISEL-NEXT: cmgt v5.4s, v0.4s, v3.4s
105+
; GISEL-NEXT: neg v6.4s, v2.4s
106+
; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h
107+
; GISEL-NEXT: mov v1.16b, v4.16b
108+
; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b
109+
; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b
110+
; GISEL-NEXT: add v0.4s, v1.4s, v0.4s
111111
; GISEL-NEXT: addv s0, v0.4s
112112
; GISEL-NEXT: fmov w0, s0
113113
; GISEL-NEXT: ret

llvm/test/CodeGen/AArch64/arm64-vabs.ll

Lines changed: 38 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -290,27 +290,26 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
290290
;
291291
; CHECK-GI-LABEL: uabd16b_rdx_i32:
292292
; CHECK-GI: // %bb.0:
293-
; CHECK-GI-NEXT: ushll.8h v3, v0, #0
294-
; CHECK-GI-NEXT: ushll.8h v4, v1, #0
295-
; CHECK-GI-NEXT: ushll2.8h v0, v0, #0
296-
; CHECK-GI-NEXT: ushll2.8h v1, v1, #0
293+
; CHECK-GI-NEXT: usubl.8h v3, v0, v1
294+
; CHECK-GI-NEXT: usubl2.8h v0, v0, v1
297295
; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
298-
; CHECK-GI-NEXT: usubl.4s v5, v3, v4
299-
; CHECK-GI-NEXT: usubl2.4s v3, v3, v4
300-
; CHECK-GI-NEXT: usubl.4s v4, v0, v1
301-
; CHECK-GI-NEXT: usubl2.4s v0, v0, v1
302-
; CHECK-GI-NEXT: cmgt.4s v1, v2, v5
303-
; CHECK-GI-NEXT: cmgt.4s v6, v2, v3
304-
; CHECK-GI-NEXT: neg.4s v16, v5
305-
; CHECK-GI-NEXT: cmgt.4s v7, v2, v4
306-
; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
307-
; CHECK-GI-NEXT: neg.4s v17, v3
308-
; CHECK-GI-NEXT: neg.4s v18, v4
309-
; CHECK-GI-NEXT: neg.4s v19, v0
310-
; CHECK-GI-NEXT: bsl.16b v1, v16, v5
311-
; CHECK-GI-NEXT: bit.16b v3, v17, v6
312-
; CHECK-GI-NEXT: bit.16b v4, v18, v7
313-
; CHECK-GI-NEXT: bit.16b v0, v19, v2
296+
; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
297+
; CHECK-GI-NEXT: sshll.4s v5, v0, #0
298+
; CHECK-GI-NEXT: sshll.4s v1, v3, #0
299+
; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
300+
; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
301+
; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
302+
; CHECK-GI-NEXT: cmgt.4s v16, v2, v4
303+
; CHECK-GI-NEXT: cmgt.4s v18, v2, v5
304+
; CHECK-GI-NEXT: cmgt.4s v7, v2, v1
305+
; CHECK-GI-NEXT: neg.4s v17, v1
306+
; CHECK-GI-NEXT: cmgt.4s v2, v2, v6
307+
; CHECK-GI-NEXT: neg.4s v19, v5
308+
; CHECK-GI-NEXT: bif.16b v3, v4, v16
309+
; CHECK-GI-NEXT: mov.16b v4, v18
310+
; CHECK-GI-NEXT: bit.16b v1, v17, v7
311+
; CHECK-GI-NEXT: bif.16b v0, v6, v2
312+
; CHECK-GI-NEXT: bsl.16b v4, v19, v5
314313
; CHECK-GI-NEXT: add.4s v1, v1, v3
315314
; CHECK-GI-NEXT: add.4s v0, v4, v0
316315
; CHECK-GI-NEXT: add.4s v0, v1, v0
@@ -338,27 +337,26 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
338337
;
339338
; CHECK-GI-LABEL: sabd16b_rdx_i32:
340339
; CHECK-GI: // %bb.0:
341-
; CHECK-GI-NEXT: sshll.8h v3, v0, #0
342-
; CHECK-GI-NEXT: sshll.8h v4, v1, #0
343-
; CHECK-GI-NEXT: sshll2.8h v0, v0, #0
344-
; CHECK-GI-NEXT: sshll2.8h v1, v1, #0
340+
; CHECK-GI-NEXT: ssubl.8h v3, v0, v1
341+
; CHECK-GI-NEXT: ssubl2.8h v0, v0, v1
345342
; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
346-
; CHECK-GI-NEXT: ssubl.4s v5, v3, v4
347-
; CHECK-GI-NEXT: ssubl2.4s v3, v3, v4
348-
; CHECK-GI-NEXT: ssubl.4s v4, v0, v1
349-
; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1
350-
; CHECK-GI-NEXT: cmgt.4s v1, v2, v5
351-
; CHECK-GI-NEXT: cmgt.4s v6, v2, v3
352-
; CHECK-GI-NEXT: neg.4s v16, v5
353-
; CHECK-GI-NEXT: cmgt.4s v7, v2, v4
354-
; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
355-
; CHECK-GI-NEXT: neg.4s v17, v3
356-
; CHECK-GI-NEXT: neg.4s v18, v4
357-
; CHECK-GI-NEXT: neg.4s v19, v0
358-
; CHECK-GI-NEXT: bsl.16b v1, v16, v5
359-
; CHECK-GI-NEXT: bit.16b v3, v17, v6
360-
; CHECK-GI-NEXT: bit.16b v4, v18, v7
361-
; CHECK-GI-NEXT: bit.16b v0, v19, v2
343+
; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
344+
; CHECK-GI-NEXT: sshll.4s v5, v0, #0
345+
; CHECK-GI-NEXT: sshll.4s v1, v3, #0
346+
; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
347+
; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
348+
; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
349+
; CHECK-GI-NEXT: cmgt.4s v16, v2, v4
350+
; CHECK-GI-NEXT: cmgt.4s v18, v2, v5
351+
; CHECK-GI-NEXT: cmgt.4s v7, v2, v1
352+
; CHECK-GI-NEXT: neg.4s v17, v1
353+
; CHECK-GI-NEXT: cmgt.4s v2, v2, v6
354+
; CHECK-GI-NEXT: neg.4s v19, v5
355+
; CHECK-GI-NEXT: bif.16b v3, v4, v16
356+
; CHECK-GI-NEXT: mov.16b v4, v18
357+
; CHECK-GI-NEXT: bit.16b v1, v17, v7
358+
; CHECK-GI-NEXT: bif.16b v0, v6, v2
359+
; CHECK-GI-NEXT: bsl.16b v4, v19, v5
362360
; CHECK-GI-NEXT: add.4s v1, v1, v3
363361
; CHECK-GI-NEXT: add.4s v0, v4, v0
364362
; CHECK-GI-NEXT: add.4s v0, v1, v0

0 commit comments

Comments
 (0)