Skip to content

Commit 36bb17a

Browse files
authored
[AArch64] Utilize XAR for certain vector rotates (#137629)
Resolves #137162. For cases when there isn't any `XOR` in the transformation, use a zero register as the second `XAR` operand instead.
1 parent 95d440c commit 36bb17a

File tree

3 files changed

+197
-18
lines changed

3 files changed

+197
-18
lines changed

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 44 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4532,7 +4532,9 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
45324532

45334533
SDValue N0 = N->getOperand(0);
45344534
SDValue N1 = N->getOperand(1);
4535+
45354536
EVT VT = N->getValueType(0);
4537+
SDLoc DL(N);
45364538

45374539
// Essentially: rotr (xor(x, y), imm) -> xar (x, y, imm)
45384540
// Rotate by a constant is a funnel shift in IR which is expanded to
@@ -4558,10 +4560,18 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
45584560
!TLI->isAllActivePredicate(*CurDAG, N1.getOperand(0)))
45594561
return false;
45604562

4561-
SDValue XOR = N0.getOperand(1);
4562-
if (XOR.getOpcode() != ISD::XOR || XOR != N1.getOperand(1))
4563+
if (N0.getOperand(1) != N1.getOperand(1))
45634564
return false;
45644565

4566+
SDValue R1, R2;
4567+
bool IsXOROperand = true;
4568+
if (N0.getOperand(1).getOpcode() != ISD::XOR) {
4569+
IsXOROperand = false;
4570+
} else {
4571+
R1 = N0.getOperand(1).getOperand(0);
4572+
R2 = N1.getOperand(1).getOperand(1);
4573+
}
4574+
45654575
APInt ShlAmt, ShrAmt;
45664576
if (!ISD::isConstantSplatVector(N0.getOperand(2).getNode(), ShlAmt) ||
45674577
!ISD::isConstantSplatVector(N1.getOperand(2).getNode(), ShrAmt))
@@ -4570,11 +4580,23 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
45704580
if (ShlAmt + ShrAmt != VT.getScalarSizeInBits())
45714581
return false;
45724582

4573-
SDLoc DL(N);
4583+
if (!IsXOROperand) {
4584+
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i64);
4585+
SDNode *MOV = CurDAG->getMachineNode(AArch64::MOVIv2d_ns, DL, VT, Zero);
4586+
SDValue MOVIV = SDValue(MOV, 0);
4587+
4588+
SDValue ZSub = CurDAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
4589+
SDNode *SubRegToReg = CurDAG->getMachineNode(AArch64::SUBREG_TO_REG, DL,
4590+
VT, Zero, MOVIV, ZSub);
4591+
4592+
R1 = N1->getOperand(1);
4593+
R2 = SDValue(SubRegToReg, 0);
4594+
}
4595+
45744596
SDValue Imm =
45754597
CurDAG->getTargetConstant(ShrAmt.getZExtValue(), DL, MVT::i32);
45764598

4577-
SDValue Ops[] = {XOR.getOperand(0), XOR.getOperand(1), Imm};
4599+
SDValue Ops[] = {R1, R2, Imm};
45784600
if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::Int>(
45794601
VT, {AArch64::XAR_ZZZI_B, AArch64::XAR_ZZZI_H, AArch64::XAR_ZZZI_S,
45804602
AArch64::XAR_ZZZI_D})) {
@@ -4591,24 +4613,36 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
45914613
N1->getOpcode() != AArch64ISD::VLSHR)
45924614
return false;
45934615

4594-
if (N0->getOperand(0) != N1->getOperand(0) ||
4595-
N1->getOperand(0)->getOpcode() != ISD::XOR)
4616+
if (N0->getOperand(0) != N1->getOperand(0))
45964617
return false;
45974618

4598-
SDValue XOR = N0.getOperand(0);
4599-
SDValue R1 = XOR.getOperand(0);
4600-
SDValue R2 = XOR.getOperand(1);
4619+
SDValue R1, R2;
4620+
bool IsXOROperand = true;
4621+
if (N1->getOperand(0)->getOpcode() != ISD::XOR) {
4622+
IsXOROperand = false;
4623+
} else {
4624+
SDValue XOR = N0.getOperand(0);
4625+
R1 = XOR.getOperand(0);
4626+
R2 = XOR.getOperand(1);
4627+
}
46014628

46024629
unsigned HsAmt = N0.getConstantOperandVal(1);
46034630
unsigned ShAmt = N1.getConstantOperandVal(1);
46044631

4605-
SDLoc DL = SDLoc(N0.getOperand(1));
46064632
SDValue Imm = CurDAG->getTargetConstant(
46074633
ShAmt, DL, N0.getOperand(1).getValueType(), false);
46084634

46094635
if (ShAmt + HsAmt != 64)
46104636
return false;
46114637

4638+
if (!IsXOROperand) {
4639+
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i64);
4640+
SDNode *MOV = CurDAG->getMachineNode(AArch64::MOVIv2d_ns, DL, VT, Zero);
4641+
SDValue MOVIV = SDValue(MOV, 0);
4642+
R1 = N1->getOperand(0);
4643+
R2 = MOVIV;
4644+
}
4645+
46124646
SDValue Ops[] = {R1, R2, Imm};
46134647
CurDAG->SelectNodeTo(N, AArch64::XAR, N0.getValueType(), Ops);
46144648

llvm/test/CodeGen/AArch64/sve2-xar.ll

Lines changed: 75 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -169,19 +169,86 @@ define <vscale x 2 x i64> @xar_nxv2i64_l_neg1(<vscale x 2 x i64> %x, <vscale x 2
169169

170170
; OR instead of an XOR.
171171
; TODO: We could use usra instruction here for SVE2.
172-
define <vscale x 2 x i64> @xar_nxv2i64_l_neg2(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
173-
; CHECK-LABEL: xar_nxv2i64_l_neg2:
174-
; CHECK: // %bb.0:
175-
; CHECK-NEXT: orr z0.d, z0.d, z1.d
176-
; CHECK-NEXT: lsr z1.d, z0.d, #4
177-
; CHECK-NEXT: lsl z0.d, z0.d, #60
178-
; CHECK-NEXT: orr z0.d, z0.d, z1.d
179-
; CHECK-NEXT: ret
172+
define <vscale x 2 x i64> @xar_nxv2i64_l_neg2_1(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
173+
; SVE-LABEL: xar_nxv2i64_l_neg2_1:
174+
; SVE: // %bb.0:
175+
; SVE-NEXT: orr z0.d, z0.d, z1.d
176+
; SVE-NEXT: lsr z1.d, z0.d, #4
177+
; SVE-NEXT: lsl z0.d, z0.d, #60
178+
; SVE-NEXT: orr z0.d, z0.d, z1.d
179+
; SVE-NEXT: ret
180+
;
181+
; SVE2-LABEL: xar_nxv2i64_l_neg2_1:
182+
; SVE2: // %bb.0:
183+
; SVE2-NEXT: movi v2.2d, #0000000000000000
184+
; SVE2-NEXT: orr z0.d, z0.d, z1.d
185+
; SVE2-NEXT: xar z0.d, z0.d, z2.d, #4
186+
; SVE2-NEXT: ret
180187
%a = or <vscale x 2 x i64> %x, %y
181188
%b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 60))
182189
ret <vscale x 2 x i64> %b
183190
}
184191

192+
define <vscale x 4 x i32> @xar_nxv2i32_l_neg2_2(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
193+
; SVE-LABEL: xar_nxv2i32_l_neg2_2:
194+
; SVE: // %bb.0:
195+
; SVE-NEXT: orr z0.d, z0.d, z1.d
196+
; SVE-NEXT: lsr z1.s, z0.s, #4
197+
; SVE-NEXT: lsl z0.s, z0.s, #28
198+
; SVE-NEXT: orr z0.d, z0.d, z1.d
199+
; SVE-NEXT: ret
200+
;
201+
; SVE2-LABEL: xar_nxv2i32_l_neg2_2:
202+
; SVE2: // %bb.0:
203+
; SVE2-NEXT: movi v2.2d, #0000000000000000
204+
; SVE2-NEXT: orr z0.d, z0.d, z1.d
205+
; SVE2-NEXT: xar z0.s, z0.s, z2.s, #4
206+
; SVE2-NEXT: ret
207+
%a = or <vscale x 4 x i32> %x, %y
208+
%b = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 60))
209+
ret <vscale x 4 x i32> %b
210+
}
211+
212+
define <vscale x 8 x i16> @xar_nxv2i16_l_neg2_3(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
213+
; SVE-LABEL: xar_nxv2i16_l_neg2_3:
214+
; SVE: // %bb.0:
215+
; SVE-NEXT: orr z0.d, z0.d, z1.d
216+
; SVE-NEXT: lsr z1.h, z0.h, #4
217+
; SVE-NEXT: lsl z0.h, z0.h, #12
218+
; SVE-NEXT: orr z0.d, z0.d, z1.d
219+
; SVE-NEXT: ret
220+
;
221+
; SVE2-LABEL: xar_nxv2i16_l_neg2_3:
222+
; SVE2: // %bb.0:
223+
; SVE2-NEXT: movi v2.2d, #0000000000000000
224+
; SVE2-NEXT: orr z0.d, z0.d, z1.d
225+
; SVE2-NEXT: xar z0.h, z0.h, z2.h, #4
226+
; SVE2-NEXT: ret
227+
%a = or <vscale x 8 x i16> %x, %y
228+
%b = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 60))
229+
ret <vscale x 8 x i16> %b
230+
}
231+
232+
define <vscale x 16 x i8> @xar_nxv2i8_l_neg2_4(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
233+
; SVE-LABEL: xar_nxv2i8_l_neg2_4:
234+
; SVE: // %bb.0:
235+
; SVE-NEXT: orr z0.d, z0.d, z1.d
236+
; SVE-NEXT: lsr z1.b, z0.b, #4
237+
; SVE-NEXT: lsl z0.b, z0.b, #4
238+
; SVE-NEXT: orr z0.d, z0.d, z1.d
239+
; SVE-NEXT: ret
240+
;
241+
; SVE2-LABEL: xar_nxv2i8_l_neg2_4:
242+
; SVE2: // %bb.0:
243+
; SVE2-NEXT: movi v2.2d, #0000000000000000
244+
; SVE2-NEXT: orr z0.d, z0.d, z1.d
245+
; SVE2-NEXT: xar z0.b, z0.b, z2.b, #4
246+
; SVE2-NEXT: ret
247+
%a = or <vscale x 16 x i8> %x, %y
248+
%b = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 60))
249+
ret <vscale x 16 x i8> %b
250+
}
251+
185252
; Rotate amount is 0.
186253
define <vscale x 2 x i64> @xar_nxv2i64_l_neg3(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
187254
; CHECK-LABEL: xar_nxv2i64_l_neg3:

llvm/test/CodeGen/AArch64/xar.ll

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,82 @@ define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) {
1919
ret <2 x i64> %b
2020
}
2121

22+
define <2 x i64> @xar_instead_of_or1(<2 x i64> %r) {
23+
; SHA3-LABEL: xar_instead_of_or1:
24+
; SHA3: // %bb.0: // %entry
25+
; SHA3-NEXT: movi v1.2d, #0000000000000000
26+
; SHA3-NEXT: xar v0.2d, v0.2d, v1.2d, #39
27+
; SHA3-NEXT: ret
28+
;
29+
; NOSHA3-LABEL: xar_instead_of_or1:
30+
; NOSHA3: // %bb.0: // %entry
31+
; NOSHA3-NEXT: shl v1.2d, v0.2d, #25
32+
; NOSHA3-NEXT: usra v1.2d, v0.2d, #39
33+
; NOSHA3-NEXT: mov v0.16b, v1.16b
34+
; NOSHA3-NEXT: ret
35+
entry:
36+
%or = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 25))
37+
ret <2 x i64> %or
38+
}
39+
40+
define <4 x i32> @xar_instead_of_or2(<4 x i32> %r) {
41+
; SHA3-LABEL: xar_instead_of_or2:
42+
; SHA3: // %bb.0: // %entry
43+
; SHA3-NEXT: shl v1.4s, v0.4s, #25
44+
; SHA3-NEXT: usra v1.4s, v0.4s, #7
45+
; SHA3-NEXT: mov v0.16b, v1.16b
46+
; SHA3-NEXT: ret
47+
;
48+
; NOSHA3-LABEL: xar_instead_of_or2:
49+
; NOSHA3: // %bb.0: // %entry
50+
; NOSHA3-NEXT: shl v1.4s, v0.4s, #25
51+
; NOSHA3-NEXT: usra v1.4s, v0.4s, #7
52+
; NOSHA3-NEXT: mov v0.16b, v1.16b
53+
; NOSHA3-NEXT: ret
54+
entry:
55+
%or = call <4 x i32> @llvm.fshl.v2i32(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 25))
56+
ret <4 x i32> %or
57+
}
58+
59+
define <8 x i16> @xar_instead_of_or3(<8 x i16> %r) {
60+
; SHA3-LABEL: xar_instead_of_or3:
61+
; SHA3: // %bb.0: // %entry
62+
; SHA3-NEXT: shl v1.8h, v0.8h, #9
63+
; SHA3-NEXT: usra v1.8h, v0.8h, #7
64+
; SHA3-NEXT: mov v0.16b, v1.16b
65+
; SHA3-NEXT: ret
66+
;
67+
; NOSHA3-LABEL: xar_instead_of_or3:
68+
; NOSHA3: // %bb.0: // %entry
69+
; NOSHA3-NEXT: shl v1.8h, v0.8h, #9
70+
; NOSHA3-NEXT: usra v1.8h, v0.8h, #7
71+
; NOSHA3-NEXT: mov v0.16b, v1.16b
72+
; NOSHA3-NEXT: ret
73+
entry:
74+
%or = call <8 x i16> @llvm.fshl.v2i16(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 25))
75+
ret <8 x i16> %or
76+
}
77+
78+
define <16 x i8> @xar_instead_of_or4(<16 x i8> %r) {
79+
; SHA3-LABEL: xar_instead_of_or4:
80+
; SHA3: // %bb.0: // %entry
81+
; SHA3-NEXT: add v1.16b, v0.16b, v0.16b
82+
; SHA3-NEXT: usra v1.16b, v0.16b, #7
83+
; SHA3-NEXT: mov v0.16b, v1.16b
84+
; SHA3-NEXT: ret
85+
;
86+
; NOSHA3-LABEL: xar_instead_of_or4:
87+
; NOSHA3: // %bb.0: // %entry
88+
; NOSHA3-NEXT: add v1.16b, v0.16b, v0.16b
89+
; NOSHA3-NEXT: usra v1.16b, v0.16b, #7
90+
; NOSHA3-NEXT: mov v0.16b, v1.16b
91+
; NOSHA3-NEXT: ret
92+
entry:
93+
%or = call <16 x i8> @llvm.fshl.v2i8(<16 x i8> %r, <16 x i8> %r, <16 x i8> splat (i8 25))
94+
ret <16 x i8> %or
95+
}
96+
2297
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
98+
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
99+
declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
100+
declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)

0 commit comments

Comments
 (0)