[AArch64][GlobalISel] Push ADD/SUB through Extend Instructions #90964

Merged (1 commit) on May 29, 2024
19 changes: 18 additions & 1 deletion llvm/lib/Target/AArch64/AArch64Combine.td
@@ -52,14 +52,31 @@ def ext_uaddv_to_uaddlv : GICombineRule<
  (apply [{ applyExtUaddvToUaddlv(*${root}, MRI, B, Observer, ${matchinfo}); }])
>;

+class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICombineRule <
+  (defs root:$root),
+  (match (extOpcode $ext1, $src1):$ExtMI,
+         (extOpcode $ext2, $src2),
+         (opcode $dst, $ext1, $ext2):$root,
+         [{ return matchPushAddSubExt(*${root}, MRI, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]),
+  (apply [{ applyPushAddSubExt(*${root}, MRI, B, ${ExtMI}->getOpcode() == TargetOpcode::G_SEXT, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }])>;
+
+def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>;
+def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>;
+def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>;
+def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>;

Review comment: Nice!


def AArch64PreLegalizerCombiner: GICombiner<
  "AArch64PreLegalizerCombinerImpl", [all_combines,
                                      fconstant_to_constant,
                                      icmp_redundant_trunc,
                                      fold_global_offset,
                                      shuffle_to_extract,
                                      ext_addv_to_udot_addv,
-                                     ext_uaddv_to_uaddlv]> {
+                                     ext_uaddv_to_uaddlv,
+                                     push_sub_through_zext,
+                                     push_add_through_zext,
+                                     push_sub_through_sext,
+                                     push_add_through_sext]> {
  let CombineAllMethodName = "tryCombineAllImpl";
}

51 changes: 51 additions & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -554,6 +554,57 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
  MI.eraseFromParent();
}

+// Pushes ADD/SUB through extend instructions to decrease the number of extend
+// instructions at the end by allowing selection of {s|u}addl sooner.
+
+// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
+bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
+                        Register DstReg, Register SrcReg1, Register SrcReg2) {
+  assert((MI.getOpcode() == TargetOpcode::G_ADD ||
+          MI.getOpcode() == TargetOpcode::G_SUB) &&
+         "Expected a G_ADD or G_SUB instruction");
+
+  // Deal with vector types only.
+  LLT DstTy = MRI.getType(DstReg);
+  if (!DstTy.isVector())
+    return false;
+
+  // Only combine when the G_{S|Z}EXT widens by more than 2x (i8->i32,
+  // i8->i64, i16->i64), so an intermediate type of twice the source width
+  // exists to push the operation through.
+  Register ExtDstReg = MI.getOperand(1).getReg();
+  LLT Ext1SrcTy = MRI.getType(SrcReg1);
+  LLT Ext2SrcTy = MRI.getType(SrcReg2);
+  unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
+  unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
+  if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
+       ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
+      Ext1SrcTy == Ext2SrcTy)
+    return true;
+
+  return false;
+}
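The match above only fires for more-than-2x extends, so that an intermediate type of twice the source width exists for the pushed-through operation; a plain 2x extend such as i8->i16 is left alone, since there is no narrower legal step to push through. A standalone restatement of that predicate in plain C++ (not part of the patch; the width list is illustrative):

#include <cstdio>

int main() {
  // (source, destination) scalar widths seen by the combine.
  const unsigned Pairs[][2] = {{8, 16}, {16, 32}, {8, 32}, {8, 64}, {16, 64}};
  for (const auto &P : Pairs) {
    const unsigned Src = P[0], Dst = P[1];
    // Same condition as matchPushAddSubExt: more-than-2x extends only.
    const bool Fires = (Src == 8 && Dst == 32) ||
                       ((Src == 8 || Src == 16) && Dst == 64);
    std::printf("ext i%u -> i%u: %s (intermediate i%u)\n", Src, Dst,
                Fires ? "combine" : "skip", 2 * Src);
  }
  return 0;
}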

+void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
+                        MachineIRBuilder &B, bool isSExt, Register DstReg,
+                        Register SrcReg1, Register SrcReg2) {
+  LLT SrcTy = MRI.getType(SrcReg1);
+  LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
+  unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+  Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0);
+  Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0);
+  Register AddReg =
+      B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);
+
+  // G_SUB has to sign-extend the result.
+  // G_ADD needs to sext from sext and can sext or zext from zext, so the
+  // original opcode is used.
+  if (MI.getOpcode() == TargetOpcode::G_ADD)
Review comment (Collaborator): Can you add a comment explaining the outer extends.

+    B.buildInstr(Opc, {DstReg}, {AddReg});
+  else
+    B.buildSExt(DstReg, AddReg);
+
+  MI.eraseFromParent();
+}
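The choice of outer extend is the subtle part (see the review comment above): G_SUB always gets an outer G_SEXT, while G_ADD reuses the original extend opcode. A minimal scalar model, not part of the patch, that exhaustively checks the corresponding identities for the i8->i32 shape over all inputs (the other widths behave the same):

#include <cassert>
#include <cstdint>

int main() {
  for (int A = 0; A < 256; ++A) {
    for (int B = 0; B < 256; ++B) {
      const uint8_t UA = (uint8_t)A, UB = (uint8_t)B;
      const int8_t SA = (int8_t)UA, SB = (int8_t)UB;

      // sext case: the outer extend can reuse the original sign extension,
      // for G_ADD and G_SUB alike.
      assert((int32_t)SA + (int32_t)SB ==
             (int32_t)(int16_t)((int16_t)SA + (int16_t)SB));
      assert((int32_t)SA - (int32_t)SB ==
             (int32_t)(int16_t)((int16_t)SA - (int16_t)SB));

      // zext + add: the i16 sum of two zero-extended i8s cannot wrap
      // (max 255 + 255 = 510), so a zero extend of the narrow sum matches.
      assert((uint32_t)UA + (uint32_t)UB ==
             (uint32_t)(uint16_t)((uint16_t)UA + (uint16_t)UB));

      // zext + sub: the narrow difference may be negative, so the outer
      // extend must be a *sign* extend, matching what applyPushAddSubExt
      // emits for G_SUB. Compare the raw 32-bit patterns.
      assert((uint32_t)UA - (uint32_t)UB ==
             (uint32_t)(int32_t)(int16_t)((uint16_t)UA - (uint16_t)UB));
    }
  }
  return 0;
}

If the zext + sub case used a zero extend for the outer step, the last assertion would fail whenever UA < UB, which is exactly why the apply step special-cases G_SUB.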

bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                        CombinerHelper &Helper, GISelChangeObserver &Observer) {
  // Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if
36 changes: 20 additions & 16 deletions llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
@@ -219,10 +219,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
-; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>)
-; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>)
-; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[SEXT]], [[SEXT1]]
-; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
+; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>)
+; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>)
+; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[SEXT]], [[SEXT1]]
+; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[ADD]](<8 x s16>)
+; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>)
; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
@@ -249,10 +250,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
-; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>)
-; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>)
-; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[ZEXT]], [[ZEXT1]]
-; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
+; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>)
+; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>)
+; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[ZEXT]], [[ZEXT1]]
+; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[ADD]](<8 x s16>)
+; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ZEXT2]](<8 x s32>)
; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
@@ -279,10 +281,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
-; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>)
-; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>)
-; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[SEXT]], [[SEXT1]]
-; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>)
+; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>)
+; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>)
+; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[SEXT]], [[SEXT1]]
+; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>)
+; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>)
; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
@@ -309,10 +312,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
-; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>)
-; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>)
-; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[ZEXT]], [[ZEXT1]]
-; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>)
+; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>)
+; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>)
+; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[ZEXT]], [[ZEXT1]]
+; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>)
+; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT]](<8 x s32>)
; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
25 changes: 13 additions & 12 deletions llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -94,18 +94,19 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias
;
; GISEL-LABEL: oversized_ADDV_256:
; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: ldr d0, [x0]
-; GISEL-NEXT: ldr d1, [x1]
-; GISEL-NEXT: ushll v0.8h, v0.8b, #0
-; GISEL-NEXT: ushll v1.8h, v1.8b, #0
-; GISEL-NEXT: usubl v2.4s, v0.4h, v1.4h
-; GISEL-NEXT: usubl2 v0.4s, v0.8h, v1.8h
-; GISEL-NEXT: cmlt v1.4s, v2.4s, #0
-; GISEL-NEXT: cmlt v3.4s, v0.4s, #0
-; GISEL-NEXT: neg v4.4s, v2.4s
-; GISEL-NEXT: neg v5.4s, v0.4s
-; GISEL-NEXT: bsl v1.16b, v4.16b, v2.16b
-; GISEL-NEXT: bit v0.16b, v5.16b, v3.16b
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: movi v0.2d, #0000000000000000
+; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b
+; GISEL-NEXT: sshll v2.4s, v1.4h, #0
+; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0
+; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h
+; GISEL-NEXT: cmlt v4.4s, v2.4s, #0
+; GISEL-NEXT: cmlt v5.4s, v3.4s, #0
+; GISEL-NEXT: neg v6.4s, v2.4s
+; GISEL-NEXT: mov v1.16b, v4.16b
+; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b
+; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b
; GISEL-NEXT: add v0.4s, v1.4s, v0.4s
; GISEL-NEXT: addv s0, v0.4s
; GISEL-NEXT: fmov w0, s0
82 changes: 42 additions & 40 deletions llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -289,26 +289,27 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-GI-LABEL: uabd16b_rdx_i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ushll.8h v2, v0, #0
-; CHECK-GI-NEXT: ushll.8h v3, v1, #0
-; CHECK-GI-NEXT: ushll2.8h v0, v0, #0
-; CHECK-GI-NEXT: ushll2.8h v1, v1, #0
-; CHECK-GI-NEXT: usubl.4s v4, v2, v3
-; CHECK-GI-NEXT: usubl2.4s v2, v2, v3
-; CHECK-GI-NEXT: usubl.4s v3, v0, v1
-; CHECK-GI-NEXT: usubl2.4s v0, v0, v1
-; CHECK-GI-NEXT: cmlt.4s v1, v4, #0
-; CHECK-GI-NEXT: cmlt.4s v5, v2, #0
-; CHECK-GI-NEXT: neg.4s v16, v4
-; CHECK-GI-NEXT: cmlt.4s v6, v3, #0
-; CHECK-GI-NEXT: cmlt.4s v7, v0, #0
-; CHECK-GI-NEXT: neg.4s v17, v2
-; CHECK-GI-NEXT: neg.4s v18, v3
-; CHECK-GI-NEXT: neg.4s v19, v0
-; CHECK-GI-NEXT: bsl.16b v1, v16, v4
-; CHECK-GI-NEXT: bit.16b v2, v17, v5
-; CHECK-GI-NEXT: bit.16b v3, v18, v6
-; CHECK-GI-NEXT: bit.16b v0, v19, v7
+; CHECK-GI-NEXT: usubl.8h v3, v0, v1
+; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
+; CHECK-GI-NEXT: usubl2.8h v0, v0, v1
+; CHECK-GI-NEXT: sshll.4s v1, v3, #0
+; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
+; CHECK-GI-NEXT: sshll.4s v5, v0, #0
+; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
+; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
+; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
+; CHECK-GI-NEXT: cmlt.4s v2, v1, #0
+; CHECK-GI-NEXT: cmlt.4s v7, v4, #0
+; CHECK-GI-NEXT: neg.4s v16, v1
+; CHECK-GI-NEXT: cmlt.4s v17, v5, #0
+; CHECK-GI-NEXT: cmlt.4s v18, v6, #0
+; CHECK-GI-NEXT: neg.4s v19, v5
+; CHECK-GI-NEXT: bit.16b v1, v16, v2
+; CHECK-GI-NEXT: mov.16b v2, v7
+; CHECK-GI-NEXT: bif.16b v0, v6, v18
+; CHECK-GI-NEXT: bsl.16b v2, v3, v4
+; CHECK-GI-NEXT: mov.16b v3, v17
+; CHECK-GI-NEXT: bsl.16b v3, v19, v5
; CHECK-GI-NEXT: add.4s v1, v1, v2
; CHECK-GI-NEXT: add.4s v0, v3, v0
; CHECK-GI-NEXT: add.4s v0, v1, v0
@@ -336,26 +337,27 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-GI-LABEL: sabd16b_rdx_i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: sshll.8h v2, v0, #0
-; CHECK-GI-NEXT: sshll.8h v3, v1, #0
-; CHECK-GI-NEXT: sshll2.8h v0, v0, #0
-; CHECK-GI-NEXT: sshll2.8h v1, v1, #0
-; CHECK-GI-NEXT: ssubl.4s v4, v2, v3
-; CHECK-GI-NEXT: ssubl2.4s v2, v2, v3
-; CHECK-GI-NEXT: ssubl.4s v3, v0, v1
-; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1
-; CHECK-GI-NEXT: cmlt.4s v1, v4, #0
-; CHECK-GI-NEXT: cmlt.4s v5, v2, #0
-; CHECK-GI-NEXT: neg.4s v16, v4
-; CHECK-GI-NEXT: cmlt.4s v6, v3, #0
-; CHECK-GI-NEXT: cmlt.4s v7, v0, #0
-; CHECK-GI-NEXT: neg.4s v17, v2
-; CHECK-GI-NEXT: neg.4s v18, v3
-; CHECK-GI-NEXT: neg.4s v19, v0
-; CHECK-GI-NEXT: bsl.16b v1, v16, v4
-; CHECK-GI-NEXT: bit.16b v2, v17, v5
-; CHECK-GI-NEXT: bit.16b v3, v18, v6
-; CHECK-GI-NEXT: bit.16b v0, v19, v7
+; CHECK-GI-NEXT: ssubl.8h v3, v0, v1
+; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
+; CHECK-GI-NEXT: ssubl2.8h v0, v0, v1
+; CHECK-GI-NEXT: sshll.4s v1, v3, #0
+; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
+; CHECK-GI-NEXT: sshll.4s v5, v0, #0
+; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
+; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
+; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
+; CHECK-GI-NEXT: cmlt.4s v2, v1, #0
+; CHECK-GI-NEXT: cmlt.4s v7, v4, #0
+; CHECK-GI-NEXT: neg.4s v16, v1
+; CHECK-GI-NEXT: cmlt.4s v17, v5, #0
+; CHECK-GI-NEXT: cmlt.4s v18, v6, #0
+; CHECK-GI-NEXT: neg.4s v19, v5
+; CHECK-GI-NEXT: bit.16b v1, v16, v2
+; CHECK-GI-NEXT: mov.16b v2, v7
+; CHECK-GI-NEXT: bif.16b v0, v6, v18
+; CHECK-GI-NEXT: bsl.16b v2, v3, v4
+; CHECK-GI-NEXT: mov.16b v3, v17
+; CHECK-GI-NEXT: bsl.16b v3, v19, v5
; CHECK-GI-NEXT: add.4s v1, v1, v2
; CHECK-GI-NEXT: add.4s v0, v3, v0
; CHECK-GI-NEXT: add.4s v0, v1, v0