Skip to content

Commit 3d354d0

Browse files
committed
[AArch64][GlobalISel] Combine MUL(AND(LSHR(X, 15), 0x10001), 0xffff) to CMLTz
This patch ports the SelectionDAG combine from https://reviews.llvm.org/D130874 to GlobalISel.
1 parent 3ce9b86 commit 3d354d0

File tree

3 files changed

+90
-90
lines changed

3 files changed

+90
-90
lines changed

llvm/lib/Target/AArch64/AArch64Combine.td

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,14 @@ def or_to_bsp: GICombineRule <
265265
(apply [{ applyOrToBSP(*${root}, MRI, B, ${matchinfo}); }])
266266
>;
267267

268+
// Fold Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz.
// The rule is rooted at a G_MUL. matchCombineMulCMLT decides whether the root
// qualifies and fills in the register carried by $matchinfo, which
// applyCombineMulCMLT then consumes to perform the rewrite.
def combine_mul_cmlt : GICombineRule<
  (defs root:$root, register_matchinfo:$matchinfo),
  (match (wip_match_opcode G_MUL):$root,
         [{ return matchCombineMulCMLT(*${root}, MRI, ${matchinfo}); }]),
  (apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])>;
275+
268276
// Post-legalization combines which should happen at all optimization levels.
269277
// (E.g. ones that facilitate matching for the selector) For example, matching
270278
// pseudos.
@@ -296,5 +304,6 @@ def AArch64PostLegalizerCombiner
296304
split_store_zero_128, undef_combines,
297305
select_to_minmax, or_to_bsp, combine_concat_vector,
298306
commute_constant_to_rhs,
299-
push_freeze_to_prevent_poison_from_propagating]> {
307+
push_freeze_to_prevent_poison_from_propagating,
308+
combine_mul_cmlt]> {
300309
}

llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,61 @@ void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
381381
MI.eraseFromParent();
382382
}
383383

384+
// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
385+
bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
386+
Register &SrcReg) {
387+
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
388+
389+
if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
390+
DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
391+
DstTy != LLT::fixed_vector(8, 16))
392+
return false;
393+
394+
auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
395+
if (AndMI->getOpcode() != TargetOpcode::G_AND)
396+
return false;
397+
auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
398+
if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
399+
return false;
400+
401+
// Check the constant splat values
402+
auto V1 = isConstantOrConstantSplatVector(
403+
*MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
404+
auto V2 = isConstantOrConstantSplatVector(
405+
*MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
406+
auto V3 = isConstantOrConstantSplatVector(
407+
*MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
408+
if (!V1.has_value() || !V2.has_value() || !V3.has_value())
409+
return false;
410+
unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
411+
if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
412+
V3 != (HalfSize - 1))
413+
return false;
414+
415+
SrcReg = LShrMI->getOperand(1).getReg();
416+
417+
return true;
418+
}
419+
420+
// Rewrites the G_MUL accepted by matchCombineMulCMLT as a signed
// "less than zero" compare on half-width lanes:
//   1. build a zero constant of the half-width type,
//   2. bitcast SrcReg to that type (twice the lanes, half the lane size),
//   3. emit ICMP_SLT of the bitcast value against zero,
//   4. bitcast the compare result into the G_MUL's destination register.
// MI is erased once the replacement has been built.
void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &B, Register &SrcReg) {
  const Register Dst = MI.getOperand(0).getReg();
  const LLT WideTy = MRI.getType(Dst);

  // Same overall vector size as the destination, split into narrower lanes.
  const auto NarrowCount = WideTy.getElementCount().multiplyCoefficientBy(2);
  const unsigned NarrowBits = WideTy.getScalarSizeInBits() / 2;
  const LLT NarrowTy =
      WideTy.changeElementCount(NarrowCount).changeElementSize(NarrowBits);

  const Register Zero = B.buildConstant(NarrowTy, 0).getReg(0);
  const Register Narrowed =
      B.buildInstr(TargetOpcode::G_BITCAST, {NarrowTy}, {SrcReg}).getReg(0);
  const Register IsNegative =
      B.buildICmp(CmpInst::Predicate::ICMP_SLT, NarrowTy, Narrowed, Zero)
          .getReg(0);

  // Define the original destination register directly from the compare.
  B.buildInstr(TargetOpcode::G_BITCAST, {Dst}, {IsNegative});
  MI.eraseFromParent();
}
438+
384439
class AArch64PostLegalizerCombinerImpl : public Combiner {
385440
protected:
386441
// TODO: Make CombinerHelper methods const.

llvm/test/CodeGen/AArch64/mulcmle.ll

Lines changed: 25 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -24,130 +24,66 @@ define <1 x i64> @v1i64(<1 x i64> %a) {
2424
}
2525

2626
define <2 x i64> @v2i64(<2 x i64> %a) {
27-
; CHECK-SD-LABEL: v2i64:
28-
; CHECK-SD: // %bb.0:
29-
; CHECK-SD-NEXT: cmlt v0.4s, v0.4s, #0
30-
; CHECK-SD-NEXT: ret
31-
;
32-
; CHECK-GI-LABEL: v2i64:
33-
; CHECK-GI: // %bb.0:
34-
; CHECK-GI-NEXT: movi v1.4s, #1
35-
; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #31
36-
; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff
37-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
38-
; CHECK-GI-NEXT: fmov x11, d2
39-
; CHECK-GI-NEXT: mov x9, v2.d[1]
40-
; CHECK-GI-NEXT: fmov x10, d0
41-
; CHECK-GI-NEXT: mov x8, v0.d[1]
42-
; CHECK-GI-NEXT: mul x10, x10, x11
43-
; CHECK-GI-NEXT: mul x8, x8, x9
44-
; CHECK-GI-NEXT: fmov d0, x10
45-
; CHECK-GI-NEXT: mov v0.d[1], x8
46-
; CHECK-GI-NEXT: ret
27+
; CHECK-LABEL: v2i64:
28+
; CHECK: // %bb.0:
29+
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
30+
; CHECK-NEXT: ret
4731
%b = lshr <2 x i64> %a, <i64 31, i64 31>
4832
%c = and <2 x i64> %b, <i64 4294967297, i64 4294967297>
4933
%d = mul nuw <2 x i64> %c, <i64 4294967295, i64 4294967295>
5034
ret <2 x i64> %d
5135
}
5236

5337
define <2 x i32> @v2i32(<2 x i32> %a) {
54-
; CHECK-SD-LABEL: v2i32:
55-
; CHECK-SD: // %bb.0:
56-
; CHECK-SD-NEXT: cmlt v0.4h, v0.4h, #0
57-
; CHECK-SD-NEXT: ret
58-
;
59-
; CHECK-GI-LABEL: v2i32:
60-
; CHECK-GI: // %bb.0:
61-
; CHECK-GI-NEXT: movi v1.4h, #1
62-
; CHECK-GI-NEXT: ushr v0.2s, v0.2s, #15
63-
; CHECK-GI-NEXT: movi d2, #0x00ffff0000ffff
64-
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
65-
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
66-
; CHECK-GI-NEXT: ret
38+
; CHECK-LABEL: v2i32:
39+
; CHECK: // %bb.0:
40+
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
41+
; CHECK-NEXT: ret
6742
%b = lshr <2 x i32> %a, <i32 15, i32 15>
6843
%c = and <2 x i32> %b, <i32 65537, i32 65537>
6944
%d = mul nuw <2 x i32> %c, <i32 65535, i32 65535>
7045
ret <2 x i32> %d
7146
}
7247

7348
define <4 x i32> @v4i32(<4 x i32> %a) {
74-
; CHECK-SD-LABEL: v4i32:
75-
; CHECK-SD: // %bb.0:
76-
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
77-
; CHECK-SD-NEXT: ret
78-
;
79-
; CHECK-GI-LABEL: v4i32:
80-
; CHECK-GI: // %bb.0:
81-
; CHECK-GI-NEXT: movi v1.8h, #1
82-
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
83-
; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff
84-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
85-
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v2.4s
86-
; CHECK-GI-NEXT: ret
49+
; CHECK-LABEL: v4i32:
50+
; CHECK: // %bb.0:
51+
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
52+
; CHECK-NEXT: ret
8753
%b = lshr <4 x i32> %a, <i32 15, i32 15, i32 15, i32 15>
8854
%c = and <4 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537>
8955
%d = mul nuw <4 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535>
9056
ret <4 x i32> %d
9157
}
9258

9359
define <8 x i32> @v8i32(<8 x i32> %a) {
94-
; CHECK-SD-LABEL: v8i32:
95-
; CHECK-SD: // %bb.0:
96-
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
97-
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
98-
; CHECK-SD-NEXT: ret
99-
;
100-
; CHECK-GI-LABEL: v8i32:
101-
; CHECK-GI: // %bb.0:
102-
; CHECK-GI-NEXT: movi v2.8h, #1
103-
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
104-
; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #15
105-
; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff
106-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
107-
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
108-
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s
109-
; CHECK-GI-NEXT: mul v1.4s, v1.4s, v3.4s
110-
; CHECK-GI-NEXT: ret
60+
; CHECK-LABEL: v8i32:
61+
; CHECK: // %bb.0:
62+
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
63+
; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
64+
; CHECK-NEXT: ret
11165
%b = lshr <8 x i32> %a, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
11266
%c = and <8 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
11367
%d = mul nuw <8 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
11468
ret <8 x i32> %d
11569
}
11670

11771
define <4 x i16> @v4i16(<4 x i16> %a) {
118-
; CHECK-SD-LABEL: v4i16:
119-
; CHECK-SD: // %bb.0:
120-
; CHECK-SD-NEXT: cmlt v0.8b, v0.8b, #0
121-
; CHECK-SD-NEXT: ret
122-
;
123-
; CHECK-GI-LABEL: v4i16:
124-
; CHECK-GI: // %bb.0:
125-
; CHECK-GI-NEXT: movi v1.8b, #1
126-
; CHECK-GI-NEXT: ushr v0.4h, v0.4h, #7
127-
; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff
128-
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
129-
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h
130-
; CHECK-GI-NEXT: ret
72+
; CHECK-LABEL: v4i16:
73+
; CHECK: // %bb.0:
74+
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
75+
; CHECK-NEXT: ret
13176
%b = lshr <4 x i16> %a, <i16 7, i16 7, i16 7, i16 7>
13277
%c = and <4 x i16> %b, <i16 257, i16 257, i16 257, i16 257>
13378
%d = mul nuw <4 x i16> %c, <i16 255, i16 255, i16 255, i16 255>
13479
ret <4 x i16> %d
13580
}
13681

13782
define <8 x i16> @v8i16(<8 x i16> %a) {
138-
; CHECK-SD-LABEL: v8i16:
139-
; CHECK-SD: // %bb.0:
140-
; CHECK-SD-NEXT: cmlt v0.16b, v0.16b, #0
141-
; CHECK-SD-NEXT: ret
142-
;
143-
; CHECK-GI-LABEL: v8i16:
144-
; CHECK-GI: // %bb.0:
145-
; CHECK-GI-NEXT: movi v1.16b, #1
146-
; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #7
147-
; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
148-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
149-
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
150-
; CHECK-GI-NEXT: ret
83+
; CHECK-LABEL: v8i16:
84+
; CHECK: // %bb.0:
85+
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
86+
; CHECK-NEXT: ret
15187
%b = lshr <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
15288
%c = and <8 x i16> %b, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
15389
%d = mul nuw <8 x i16> %c, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>

0 commit comments

Comments
 (0)