Skip to content

Commit 14dc97d

Browse files
authored
[AArch64][GlobalISel] Push ADD/SUB through Extend Instructions (#90964)
The regression in one test is due to a SUB instruction being pushed through the extend while the abs instruction is left behind, which prevents selection of the uabdl instructions, as shown below: `i32 abs(i32 sub(i32 ext i8, i32 ext i8))` => `i32 abs(i32 ext(i16 sub(i16 ext i8, i16 ext i8)))`. This is intended to be fixed in a follow-up patch.
1 parent e20f0fe commit 14dc97d

File tree

7 files changed

+494
-483
lines changed

7 files changed

+494
-483
lines changed

llvm/lib/Target/AArch64/AArch64Combine.td

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,31 @@ def ext_uaddv_to_uaddlv : GICombineRule<
5252
(apply [{ applyExtUaddvToUaddlv(*${root}, MRI, B, Observer, ${matchinfo}); }])
5353
>;
5454

55+
// Combine rule template parameterized on an arithmetic opcode and an extend
// opcode. It matches a root `opcode` instruction whose two inputs ($ext1,
// $ext2) are both produced by `extOpcode` instructions. The match predicate
// matchPushAddSubExt receives the root, its destination register and the two
// pre-extend source registers; the apply step calls applyPushAddSubExt with
// the same registers plus a flag saying whether the first matched extend
// ($ExtMI) is a G_SEXT.
class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICombineRule <
56+
(defs root:$root),
57+
(match (extOpcode $ext1, $src1):$ExtMI,
58+
(extOpcode $ext2, $src2),
59+
(opcode $dst, $ext1, $ext2):$root,
60+
[{ return matchPushAddSubExt(*${root}, MRI, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]),
61+
(apply [{ applyPushAddSubExt(*${root}, MRI, B, ${ExtMI}->getOpcode() == TargetOpcode::G_SEXT, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }])>;
62+
63+
// Concrete instantiations of push_opcode_through_ext: one rule for each
// combination of {G_SUB, G_ADD} with {G_ZEXT, G_SEXT}.
def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>;
64+
def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>;
65+
def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>;
66+
def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>;
67+
5568
def AArch64PreLegalizerCombiner: GICombiner<
5669
"AArch64PreLegalizerCombinerImpl", [all_combines,
5770
fconstant_to_constant,
5871
icmp_redundant_trunc,
5972
fold_global_offset,
6073
shuffle_to_extract,
6174
ext_addv_to_udot_addv,
62-
ext_uaddv_to_uaddlv]> {
75+
ext_uaddv_to_uaddlv,
76+
push_sub_through_zext,
77+
push_add_through_zext,
78+
push_sub_through_sext,
79+
push_add_through_sext]> {
6380
let CombineAllMethodName = "tryCombineAllImpl";
6481
}
6582

llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,57 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
554554
MI.eraseFromParent();
555555
}
556556

557+
// Pushes ADD/SUB through extend instructions to decrease the number of extend
558+
// instruction at the end by allowing selection of {s|u}addl sooner
559+
560+
// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
561+
bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
562+
Register DstReg, Register SrcReg1, Register SrcReg2) {
563+
assert(MI.getOpcode() == TargetOpcode::G_ADD ||
564+
MI.getOpcode() == TargetOpcode::G_SUB &&
565+
"Expected a G_ADD or G_SUB instruction\n");
566+
567+
// Deal with vector types only
568+
LLT DstTy = MRI.getType(DstReg);
569+
if (!DstTy.isVector())
570+
return false;
571+
572+
// Return true if G_{S|Z}EXT instruction is more than 2* source
573+
Register ExtDstReg = MI.getOperand(1).getReg();
574+
LLT Ext1SrcTy = MRI.getType(SrcReg1);
575+
LLT Ext2SrcTy = MRI.getType(SrcReg2);
576+
unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
577+
unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
578+
if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
579+
((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
580+
Ext1SrcTy == Ext2SrcTy)
581+
return true;
582+
583+
return false;
584+
}
585+
586+
// Rewrites the G_ADD/G_SUB in MI so the arithmetic happens at twice the
// source element width, then extends the narrow result into DstReg.
//   isSExt   - true to build G_SEXT for the operand extends, false for G_ZEXT.
//   DstReg   - register that receives the final extended result.
//   SrcReg1,
//   SrcReg2  - the registers that are extended to the intermediate type.
// MI is erased once the replacement instructions have been built.
void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        MachineIRBuilder &B, bool isSExt, Register DstReg,
                        Register SrcReg1, Register SrcReg2) {
  const unsigned ArithOpc = MI.getOpcode();
  const unsigned ExtOpc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;

  // Intermediate type: the type of SrcReg1 with its element size doubled.
  const LLT NarrowTy = MRI.getType(SrcReg1);
  const LLT HalfwayTy =
      NarrowTy.changeElementSize(2 * NarrowTy.getScalarSizeInBits());

  // Extend both operands to the intermediate type and redo the arithmetic
  // there.
  Register LhsReg = B.buildInstr(ExtOpc, {HalfwayTy}, {SrcReg1}).getReg(0);
  Register RhsReg = B.buildInstr(ExtOpc, {HalfwayTy}, {SrcReg2}).getReg(0);
  Register NarrowResultReg =
      B.buildInstr(ArithOpc, {HalfwayTy}, {LhsReg, RhsReg}).getReg(0);

  // G_SUB has to sign-extend the result.
  // G_ADD needs to sext from sext and can sext or zext from zext, so the
  // original opcode is used.
  if (ArithOpc != TargetOpcode::G_ADD)
    B.buildSExt(DstReg, NarrowResultReg);
  else
    B.buildInstr(ExtOpc, {DstReg}, {NarrowResultReg});

  MI.eraseFromParent();
}
607+
557608
bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
558609
CombinerHelper &Helper, GISelChangeObserver &Observer) {
559610
// Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if

llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -219,10 +219,11 @@ body: |
219219
; CHECK-NEXT: {{ $}}
220220
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
221221
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
222-
; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>)
223-
; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>)
224-
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[SEXT]], [[SEXT1]]
225-
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
222+
; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>)
223+
; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>)
224+
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[SEXT]], [[SEXT1]]
225+
; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[ADD]](<8 x s16>)
226+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>)
226227
; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
227228
; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
228229
; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
@@ -249,10 +250,11 @@ body: |
249250
; CHECK-NEXT: {{ $}}
250251
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
251252
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
252-
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>)
253-
; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>)
254-
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[ZEXT]], [[ZEXT1]]
255-
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
253+
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>)
254+
; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>)
255+
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[ZEXT]], [[ZEXT1]]
256+
; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[ADD]](<8 x s16>)
257+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ZEXT2]](<8 x s32>)
256258
; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
257259
; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
258260
; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
@@ -279,10 +281,11 @@ body: |
279281
; CHECK-NEXT: {{ $}}
280282
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
281283
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
282-
; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>)
283-
; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>)
284-
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[SEXT]], [[SEXT1]]
285-
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>)
284+
; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>)
285+
; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>)
286+
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[SEXT]], [[SEXT1]]
287+
; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>)
288+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>)
286289
; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
287290
; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
288291
; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
@@ -309,10 +312,11 @@ body: |
309312
; CHECK-NEXT: {{ $}}
310313
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
311314
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
312-
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>)
313-
; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>)
314-
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[ZEXT]], [[ZEXT1]]
315-
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>)
315+
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>)
316+
; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>)
317+
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[ZEXT]], [[ZEXT1]]
318+
; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>)
319+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT]](<8 x s32>)
316320
; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
317321
; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
318322
; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1

llvm/test/CodeGen/AArch64/aarch64-addv.ll

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -94,18 +94,19 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias
9494
;
9595
; GISEL-LABEL: oversized_ADDV_256:
9696
; GISEL: // %bb.0: // %entry
97-
; GISEL-NEXT: ldr d0, [x0]
98-
; GISEL-NEXT: ldr d1, [x1]
99-
; GISEL-NEXT: ushll v0.8h, v0.8b, #0
100-
; GISEL-NEXT: ushll v1.8h, v1.8b, #0
101-
; GISEL-NEXT: usubl v2.4s, v0.4h, v1.4h
102-
; GISEL-NEXT: usubl2 v0.4s, v0.8h, v1.8h
103-
; GISEL-NEXT: cmlt v1.4s, v2.4s, #0
104-
; GISEL-NEXT: cmlt v3.4s, v0.4s, #0
105-
; GISEL-NEXT: neg v4.4s, v2.4s
106-
; GISEL-NEXT: neg v5.4s, v0.4s
107-
; GISEL-NEXT: bsl v1.16b, v4.16b, v2.16b
108-
; GISEL-NEXT: bit v0.16b, v5.16b, v3.16b
97+
; GISEL-NEXT: ldr d1, [x0]
98+
; GISEL-NEXT: ldr d2, [x1]
99+
; GISEL-NEXT: movi v0.2d, #0000000000000000
100+
; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b
101+
; GISEL-NEXT: sshll v2.4s, v1.4h, #0
102+
; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0
103+
; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h
104+
; GISEL-NEXT: cmlt v4.4s, v2.4s, #0
105+
; GISEL-NEXT: cmlt v5.4s, v3.4s, #0
106+
; GISEL-NEXT: neg v6.4s, v2.4s
107+
; GISEL-NEXT: mov v1.16b, v4.16b
108+
; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b
109+
; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b
109110
; GISEL-NEXT: add v0.4s, v1.4s, v0.4s
110111
; GISEL-NEXT: addv s0, v0.4s
111112
; GISEL-NEXT: fmov w0, s0

llvm/test/CodeGen/AArch64/arm64-vabs.ll

Lines changed: 42 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -289,26 +289,27 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
289289
;
290290
; CHECK-GI-LABEL: uabd16b_rdx_i32:
291291
; CHECK-GI: // %bb.0:
292-
; CHECK-GI-NEXT: ushll.8h v2, v0, #0
293-
; CHECK-GI-NEXT: ushll.8h v3, v1, #0
294-
; CHECK-GI-NEXT: ushll2.8h v0, v0, #0
295-
; CHECK-GI-NEXT: ushll2.8h v1, v1, #0
296-
; CHECK-GI-NEXT: usubl.4s v4, v2, v3
297-
; CHECK-GI-NEXT: usubl2.4s v2, v2, v3
298-
; CHECK-GI-NEXT: usubl.4s v3, v0, v1
299-
; CHECK-GI-NEXT: usubl2.4s v0, v0, v1
300-
; CHECK-GI-NEXT: cmlt.4s v1, v4, #0
301-
; CHECK-GI-NEXT: cmlt.4s v5, v2, #0
302-
; CHECK-GI-NEXT: neg.4s v16, v4
303-
; CHECK-GI-NEXT: cmlt.4s v6, v3, #0
304-
; CHECK-GI-NEXT: cmlt.4s v7, v0, #0
305-
; CHECK-GI-NEXT: neg.4s v17, v2
306-
; CHECK-GI-NEXT: neg.4s v18, v3
307-
; CHECK-GI-NEXT: neg.4s v19, v0
308-
; CHECK-GI-NEXT: bsl.16b v1, v16, v4
309-
; CHECK-GI-NEXT: bit.16b v2, v17, v5
310-
; CHECK-GI-NEXT: bit.16b v3, v18, v6
311-
; CHECK-GI-NEXT: bit.16b v0, v19, v7
292+
; CHECK-GI-NEXT: usubl.8h v3, v0, v1
293+
; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
294+
; CHECK-GI-NEXT: usubl2.8h v0, v0, v1
295+
; CHECK-GI-NEXT: sshll.4s v1, v3, #0
296+
; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
297+
; CHECK-GI-NEXT: sshll.4s v5, v0, #0
298+
; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
299+
; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
300+
; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
301+
; CHECK-GI-NEXT: cmlt.4s v2, v1, #0
302+
; CHECK-GI-NEXT: cmlt.4s v7, v4, #0
303+
; CHECK-GI-NEXT: neg.4s v16, v1
304+
; CHECK-GI-NEXT: cmlt.4s v17, v5, #0
305+
; CHECK-GI-NEXT: cmlt.4s v18, v6, #0
306+
; CHECK-GI-NEXT: neg.4s v19, v5
307+
; CHECK-GI-NEXT: bit.16b v1, v16, v2
308+
; CHECK-GI-NEXT: mov.16b v2, v7
309+
; CHECK-GI-NEXT: bif.16b v0, v6, v18
310+
; CHECK-GI-NEXT: bsl.16b v2, v3, v4
311+
; CHECK-GI-NEXT: mov.16b v3, v17
312+
; CHECK-GI-NEXT: bsl.16b v3, v19, v5
312313
; CHECK-GI-NEXT: add.4s v1, v1, v2
313314
; CHECK-GI-NEXT: add.4s v0, v3, v0
314315
; CHECK-GI-NEXT: add.4s v0, v1, v0
@@ -336,26 +337,27 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
336337
;
337338
; CHECK-GI-LABEL: sabd16b_rdx_i32:
338339
; CHECK-GI: // %bb.0:
339-
; CHECK-GI-NEXT: sshll.8h v2, v0, #0
340-
; CHECK-GI-NEXT: sshll.8h v3, v1, #0
341-
; CHECK-GI-NEXT: sshll2.8h v0, v0, #0
342-
; CHECK-GI-NEXT: sshll2.8h v1, v1, #0
343-
; CHECK-GI-NEXT: ssubl.4s v4, v2, v3
344-
; CHECK-GI-NEXT: ssubl2.4s v2, v2, v3
345-
; CHECK-GI-NEXT: ssubl.4s v3, v0, v1
346-
; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1
347-
; CHECK-GI-NEXT: cmlt.4s v1, v4, #0
348-
; CHECK-GI-NEXT: cmlt.4s v5, v2, #0
349-
; CHECK-GI-NEXT: neg.4s v16, v4
350-
; CHECK-GI-NEXT: cmlt.4s v6, v3, #0
351-
; CHECK-GI-NEXT: cmlt.4s v7, v0, #0
352-
; CHECK-GI-NEXT: neg.4s v17, v2
353-
; CHECK-GI-NEXT: neg.4s v18, v3
354-
; CHECK-GI-NEXT: neg.4s v19, v0
355-
; CHECK-GI-NEXT: bsl.16b v1, v16, v4
356-
; CHECK-GI-NEXT: bit.16b v2, v17, v5
357-
; CHECK-GI-NEXT: bit.16b v3, v18, v6
358-
; CHECK-GI-NEXT: bit.16b v0, v19, v7
340+
; CHECK-GI-NEXT: ssubl.8h v3, v0, v1
341+
; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
342+
; CHECK-GI-NEXT: ssubl2.8h v0, v0, v1
343+
; CHECK-GI-NEXT: sshll.4s v1, v3, #0
344+
; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
345+
; CHECK-GI-NEXT: sshll.4s v5, v0, #0
346+
; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
347+
; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
348+
; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
349+
; CHECK-GI-NEXT: cmlt.4s v2, v1, #0
350+
; CHECK-GI-NEXT: cmlt.4s v7, v4, #0
351+
; CHECK-GI-NEXT: neg.4s v16, v1
352+
; CHECK-GI-NEXT: cmlt.4s v17, v5, #0
353+
; CHECK-GI-NEXT: cmlt.4s v18, v6, #0
354+
; CHECK-GI-NEXT: neg.4s v19, v5
355+
; CHECK-GI-NEXT: bit.16b v1, v16, v2
356+
; CHECK-GI-NEXT: mov.16b v2, v7
357+
; CHECK-GI-NEXT: bif.16b v0, v6, v18
358+
; CHECK-GI-NEXT: bsl.16b v2, v3, v4
359+
; CHECK-GI-NEXT: mov.16b v3, v17
360+
; CHECK-GI-NEXT: bsl.16b v3, v19, v5
359361
; CHECK-GI-NEXT: add.4s v1, v1, v2
360362
; CHECK-GI-NEXT: add.4s v0, v3, v0
361363
; CHECK-GI-NEXT: add.4s v0, v1, v0

0 commit comments

Comments
 (0)