Skip to content

Commit c0931d4

Browse files
committed
[AArch64][GlobalISel] Lower scalarizing G_UNMERGE_VALUES to G_EXTRACT_VECTOR_ELT
This adds post-legalizer lowering of G_UNMERGE_VALUES instructions that take a vector and produce a scalar value for each lane. Each such instruction is converted to one G_EXTRACT_VECTOR_ELT per lane, allowing all the existing tablegen patterns to apply to them. A couple of tablegen patterns need to be altered so that the type of the constant operand is known and the patterns are recognized under GlobalISel. Closes #75662
1 parent 68fb3d5 commit c0931d4

14 files changed

+622
-919
lines changed

llvm/lib/Target/AArch64/AArch64Combine.td

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,14 @@ def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn,
134134
form_duplane,
135135
shuf_to_ins]>;
136136

137+
// Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's.
// The rule roots on any G_UNMERGE_VALUES; matchScalarizeVectorUnmerge decides
// whether it fires and applyScalarizeVectorUnmerge performs the rewrite.
138+
def vector_unmerge_lowering : GICombineRule <
139+
(defs root:$root),
140+
(match (wip_match_opcode G_UNMERGE_VALUES):$root,
141+
[{ return matchScalarizeVectorUnmerge(*${root}, MRI); }]),
142+
(apply [{ applyScalarizeVectorUnmerge(*${root}, MRI, B); }])
143+
>;
144+
137145
def adjust_icmp_imm_matchdata :
138146
GIDefMatchData<"std::pair<uint64_t, CmpInst::Predicate>">;
139147
def adjust_icmp_imm : GICombineRule <
@@ -251,7 +259,8 @@ def AArch64PostLegalizerLowering
251259
icmp_lowering, build_vector_lowering,
252260
lower_vector_fcmp, form_truncstore,
253261
vector_sext_inreg_to_shift,
254-
unmerge_ext_to_unmerge, lower_mull]> {
262+
unmerge_ext_to_unmerge, lower_mull,
263+
vector_unmerge_lowering]> {
255264
}
256265

257266
// Post-legalization combines which are primarily optimizations.

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6482,23 +6482,23 @@ def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))),
64826482
// f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane))
64836483
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),
64846484
(EXTRACT_SUBREG (INSvi32lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), ssub)>;
6485-
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, 0)))),
6485+
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, (i64 0))))),
64866486
(EXTRACT_SUBREG V128:$src, ssub)>;
64876487
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))),
64886488
(EXTRACT_SUBREG (INSvi64lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), dsub)>;
6489-
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, 0)))),
6489+
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, (i64 0))))),
64906490
(EXTRACT_SUBREG V128:$src, dsub)>;
64916491

64926492
// Floating point vector extractions are codegen'd as either a sequence of
64936493
// subregister extractions, or a MOV (aka DUP here) if
64946494
// the lane number is anything other than zero.
6495-
def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
6495+
def : Pat<(f64 (vector_extract (v2f64 V128:$Rn), (i64 0))),
64966496
(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
6497-
def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
6497+
def : Pat<(f32 (vector_extract (v4f32 V128:$Rn), (i64 0))),
64986498
(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
6499-
def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
6499+
def : Pat<(f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
65006500
(f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
6501-
def : Pat<(vector_extract (v8bf16 V128:$Rn), 0),
6501+
def : Pat<(bf16 (vector_extract (v8bf16 V128:$Rn), (i64 0))),
65026502
(bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
65036503

65046504

llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,27 @@ void applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
769769
MI.eraseFromParent();
770770
}
771771

772+
/// Match predicate for the scalarizing-unmerge lowering.
///
/// \param MI  Instruction to inspect; it is cast to GUnmerge unconditionally,
///            so the caller must only pass an unmerge instruction.
/// \param MRI Register info used to look up the type of the source register.
/// \returns true when the register in the last operand has a fixed-length
///          (non-scalable) vector type and the instruction has exactly one
///          more operand than that vector has elements.
bool matchScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI) {
773+
auto &Unmerge = cast<GUnmerge>(MI);
774+
// The last operand is treated as the value being unmerged.
Register Src1Reg = Unmerge.getReg(Unmerge.getNumOperands() - 1);
775+
const LLT SrcTy = MRI.getType(Src1Reg);
776+
// Operand count == element count + 1 means every operand other than the
// source corresponds to exactly one vector lane.
return SrcTy.isVector() && !SrcTy.isScalable() &&
777+
Unmerge.getNumOperands() == (unsigned)SrcTy.getNumElements() + 1;
778+
}
779+
780+
void applyScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
781+
MachineIRBuilder &B) {
782+
auto &Unmerge = cast<GUnmerge>(MI);
783+
Register Src1Reg = Unmerge.getReg(Unmerge.getNumOperands() - 1);
784+
const LLT SrcTy = MRI.getType(Src1Reg);
785+
assert((SrcTy.isVector() && !SrcTy.isScalable()) &&
786+
"Expected a fixed length vector");
787+
788+
for (int I = 0; I < SrcTy.getNumElements(); ++I)
789+
B.buildExtractVectorElementConstant(Unmerge.getReg(I), Src1Reg, I);
790+
MI.eraseFromParent();
791+
}
792+
772793
bool matchBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI) {
773794
assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
774795
auto Splat = getAArch64VectorSplat(MI, MRI);

llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll

Lines changed: 76 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -516,20 +516,17 @@ define i8 @sminv_v4i8(<4 x i8> %a) {
516516
; CHECK-GI-LABEL: sminv_v4i8:
517517
; CHECK-GI: // %bb.0: // %entry
518518
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
519-
; CHECK-GI-NEXT: mov h1, v0.h[1]
520-
; CHECK-GI-NEXT: mov h2, v0.h[2]
521-
; CHECK-GI-NEXT: fmov w8, s0
522-
; CHECK-GI-NEXT: mov h3, v0.h[3]
519+
; CHECK-GI-NEXT: umov w8, v0.h[0]
520+
; CHECK-GI-NEXT: umov w9, v0.h[1]
521+
; CHECK-GI-NEXT: umov w10, v0.h[2]
522+
; CHECK-GI-NEXT: umov w12, v0.h[3]
523+
; CHECK-GI-NEXT: sxtb w11, w8
524+
; CHECK-GI-NEXT: cmp w11, w9, sxtb
525+
; CHECK-GI-NEXT: sxtb w11, w10
526+
; CHECK-GI-NEXT: csel w8, w8, w9, lt
527+
; CHECK-GI-NEXT: cmp w11, w12, sxtb
523528
; CHECK-GI-NEXT: sxtb w9, w8
524-
; CHECK-GI-NEXT: fmov w10, s1
525-
; CHECK-GI-NEXT: fmov w11, s2
526-
; CHECK-GI-NEXT: cmp w9, w10, sxtb
527-
; CHECK-GI-NEXT: sxtb w9, w11
528-
; CHECK-GI-NEXT: csel w8, w8, w10, lt
529-
; CHECK-GI-NEXT: fmov w10, s3
530-
; CHECK-GI-NEXT: cmp w9, w10, sxtb
531-
; CHECK-GI-NEXT: sxtb w9, w8
532-
; CHECK-GI-NEXT: csel w10, w11, w10, lt
529+
; CHECK-GI-NEXT: csel w10, w10, w12, lt
533530
; CHECK-GI-NEXT: cmp w9, w10, sxtb
534531
; CHECK-GI-NEXT: csel w0, w8, w10, lt
535532
; CHECK-GI-NEXT: ret
@@ -611,19 +608,16 @@ define i16 @sminv_v3i16(<3 x i16> %a) {
611608
; CHECK-GI: // %bb.0: // %entry
612609
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
613610
; CHECK-GI-NEXT: mov h1, v0.h[1]
614-
; CHECK-GI-NEXT: mov h2, v0.h[2]
615-
; CHECK-GI-NEXT: fmov w8, s0
616-
; CHECK-GI-NEXT: fmov w9, s0
617-
; CHECK-GI-NEXT: sxth w8, w8
618-
; CHECK-GI-NEXT: fmov w10, s1
619-
; CHECK-GI-NEXT: fmov w11, s2
611+
; CHECK-GI-NEXT: smov w8, v0.h[0]
612+
; CHECK-GI-NEXT: umov w9, v0.h[0]
613+
; CHECK-GI-NEXT: umov w10, v0.h[1]
614+
; CHECK-GI-NEXT: smov w11, v0.h[2]
615+
; CHECK-GI-NEXT: umov w13, v0.h[2]
620616
; CHECK-GI-NEXT: fmov w12, s1
621-
; CHECK-GI-NEXT: cmp w8, w10, sxth
622-
; CHECK-GI-NEXT: sxth w8, w11
623-
; CHECK-GI-NEXT: fmov w10, s2
624-
; CHECK-GI-NEXT: csel w9, w9, w12, lt
625-
; CHECK-GI-NEXT: cmp w8, w9, sxth
626-
; CHECK-GI-NEXT: csel w0, w9, w10, gt
617+
; CHECK-GI-NEXT: cmp w8, w12, sxth
618+
; CHECK-GI-NEXT: csel w8, w9, w10, lt
619+
; CHECK-GI-NEXT: cmp w11, w8, sxth
620+
; CHECK-GI-NEXT: csel w0, w8, w13, gt
627621
; CHECK-GI-NEXT: ret
628622
entry:
629623
%arg1 = call i16 @llvm.vector.reduce.smin.v3i16(<3 x i16> %a)
@@ -887,20 +881,17 @@ define i8 @smaxv_v4i8(<4 x i8> %a) {
887881
; CHECK-GI-LABEL: smaxv_v4i8:
888882
; CHECK-GI: // %bb.0: // %entry
889883
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
890-
; CHECK-GI-NEXT: mov h1, v0.h[1]
891-
; CHECK-GI-NEXT: mov h2, v0.h[2]
892-
; CHECK-GI-NEXT: fmov w8, s0
893-
; CHECK-GI-NEXT: mov h3, v0.h[3]
894-
; CHECK-GI-NEXT: sxtb w9, w8
895-
; CHECK-GI-NEXT: fmov w10, s1
896-
; CHECK-GI-NEXT: fmov w11, s2
897-
; CHECK-GI-NEXT: cmp w9, w10, sxtb
898-
; CHECK-GI-NEXT: sxtb w9, w11
899-
; CHECK-GI-NEXT: csel w8, w8, w10, gt
900-
; CHECK-GI-NEXT: fmov w10, s3
901-
; CHECK-GI-NEXT: cmp w9, w10, sxtb
884+
; CHECK-GI-NEXT: umov w8, v0.h[0]
885+
; CHECK-GI-NEXT: umov w9, v0.h[1]
886+
; CHECK-GI-NEXT: umov w10, v0.h[2]
887+
; CHECK-GI-NEXT: umov w12, v0.h[3]
888+
; CHECK-GI-NEXT: sxtb w11, w8
889+
; CHECK-GI-NEXT: cmp w11, w9, sxtb
890+
; CHECK-GI-NEXT: sxtb w11, w10
891+
; CHECK-GI-NEXT: csel w8, w8, w9, gt
892+
; CHECK-GI-NEXT: cmp w11, w12, sxtb
902893
; CHECK-GI-NEXT: sxtb w9, w8
903-
; CHECK-GI-NEXT: csel w10, w11, w10, gt
894+
; CHECK-GI-NEXT: csel w10, w10, w12, gt
904895
; CHECK-GI-NEXT: cmp w9, w10, sxtb
905896
; CHECK-GI-NEXT: csel w0, w8, w10, gt
906897
; CHECK-GI-NEXT: ret
@@ -982,19 +973,16 @@ define i16 @smaxv_v3i16(<3 x i16> %a) {
982973
; CHECK-GI: // %bb.0: // %entry
983974
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
984975
; CHECK-GI-NEXT: mov h1, v0.h[1]
985-
; CHECK-GI-NEXT: mov h2, v0.h[2]
986-
; CHECK-GI-NEXT: fmov w8, s0
987-
; CHECK-GI-NEXT: fmov w9, s0
988-
; CHECK-GI-NEXT: sxth w8, w8
989-
; CHECK-GI-NEXT: fmov w10, s1
990-
; CHECK-GI-NEXT: fmov w11, s2
976+
; CHECK-GI-NEXT: smov w8, v0.h[0]
977+
; CHECK-GI-NEXT: umov w9, v0.h[0]
978+
; CHECK-GI-NEXT: umov w10, v0.h[1]
979+
; CHECK-GI-NEXT: smov w11, v0.h[2]
980+
; CHECK-GI-NEXT: umov w13, v0.h[2]
991981
; CHECK-GI-NEXT: fmov w12, s1
992-
; CHECK-GI-NEXT: cmp w8, w10, sxth
993-
; CHECK-GI-NEXT: sxth w8, w11
994-
; CHECK-GI-NEXT: fmov w10, s2
995-
; CHECK-GI-NEXT: csel w9, w9, w12, gt
996-
; CHECK-GI-NEXT: cmp w8, w9, sxth
997-
; CHECK-GI-NEXT: csel w0, w9, w10, lt
982+
; CHECK-GI-NEXT: cmp w8, w12, sxth
983+
; CHECK-GI-NEXT: csel w8, w9, w10, gt
984+
; CHECK-GI-NEXT: cmp w11, w8, sxth
985+
; CHECK-GI-NEXT: csel w0, w8, w13, lt
998986
; CHECK-GI-NEXT: ret
999987
entry:
1000988
%arg1 = call i16 @llvm.vector.reduce.smax.v3i16(<3 x i16> %a)
@@ -1256,19 +1244,16 @@ define i8 @uminv_v4i8(<4 x i8> %a) {
12561244
; CHECK-GI-LABEL: uminv_v4i8:
12571245
; CHECK-GI: // %bb.0: // %entry
12581246
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1259-
; CHECK-GI-NEXT: mov h1, v0.h[1]
1260-
; CHECK-GI-NEXT: mov h2, v0.h[2]
1261-
; CHECK-GI-NEXT: mov h3, v0.h[3]
1262-
; CHECK-GI-NEXT: fmov w8, s0
1263-
; CHECK-GI-NEXT: fmov w10, s1
1264-
; CHECK-GI-NEXT: fmov w11, s2
1265-
; CHECK-GI-NEXT: fmov w12, s3
1266-
; CHECK-GI-NEXT: and w9, w8, #0xff
1267-
; CHECK-GI-NEXT: cmp w9, w10, uxtb
1268-
; CHECK-GI-NEXT: and w9, w11, #0xff
1269-
; CHECK-GI-NEXT: csel w8, w8, w10, lo
1270-
; CHECK-GI-NEXT: cmp w9, w12, uxtb
1271-
; CHECK-GI-NEXT: csel w9, w11, w12, lo
1247+
; CHECK-GI-NEXT: umov w8, v0.h[0]
1248+
; CHECK-GI-NEXT: umov w9, v0.h[1]
1249+
; CHECK-GI-NEXT: umov w10, v0.h[2]
1250+
; CHECK-GI-NEXT: umov w11, v0.h[3]
1251+
; CHECK-GI-NEXT: and w12, w8, #0xff
1252+
; CHECK-GI-NEXT: cmp w12, w9, uxtb
1253+
; CHECK-GI-NEXT: and w12, w10, #0xff
1254+
; CHECK-GI-NEXT: csel w8, w8, w9, lo
1255+
; CHECK-GI-NEXT: cmp w12, w11, uxtb
1256+
; CHECK-GI-NEXT: csel w9, w10, w11, lo
12721257
; CHECK-GI-NEXT: and w10, w8, #0xff
12731258
; CHECK-GI-NEXT: cmp w10, w9, uxtb
12741259
; CHECK-GI-NEXT: csel w0, w8, w9, lo
@@ -1351,19 +1336,16 @@ define i16 @uminv_v3i16(<3 x i16> %a) {
13511336
; CHECK-GI: // %bb.0: // %entry
13521337
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
13531338
; CHECK-GI-NEXT: mov h1, v0.h[1]
1354-
; CHECK-GI-NEXT: mov h2, v0.h[2]
1355-
; CHECK-GI-NEXT: fmov w8, s0
1356-
; CHECK-GI-NEXT: fmov w9, s0
1357-
; CHECK-GI-NEXT: uxth w8, w8
1358-
; CHECK-GI-NEXT: fmov w10, s1
1359-
; CHECK-GI-NEXT: fmov w11, s2
1339+
; CHECK-GI-NEXT: umov w8, v0.h[0]
1340+
; CHECK-GI-NEXT: umov w9, v0.h[0]
1341+
; CHECK-GI-NEXT: umov w10, v0.h[1]
1342+
; CHECK-GI-NEXT: umov w11, v0.h[2]
1343+
; CHECK-GI-NEXT: umov w13, v0.h[2]
13601344
; CHECK-GI-NEXT: fmov w12, s1
1361-
; CHECK-GI-NEXT: cmp w8, w10, uxth
1362-
; CHECK-GI-NEXT: uxth w8, w11
1363-
; CHECK-GI-NEXT: fmov w10, s2
1364-
; CHECK-GI-NEXT: csel w9, w9, w12, lo
1365-
; CHECK-GI-NEXT: cmp w8, w9, uxth
1366-
; CHECK-GI-NEXT: csel w0, w9, w10, hi
1345+
; CHECK-GI-NEXT: cmp w8, w12, uxth
1346+
; CHECK-GI-NEXT: csel w8, w9, w10, lo
1347+
; CHECK-GI-NEXT: cmp w11, w8, uxth
1348+
; CHECK-GI-NEXT: csel w0, w8, w13, hi
13671349
; CHECK-GI-NEXT: ret
13681350
entry:
13691351
%arg1 = call i16 @llvm.vector.reduce.umin.v3i16(<3 x i16> %a)
@@ -1625,19 +1607,16 @@ define i8 @umaxv_v4i8(<4 x i8> %a) {
16251607
; CHECK-GI-LABEL: umaxv_v4i8:
16261608
; CHECK-GI: // %bb.0: // %entry
16271609
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1628-
; CHECK-GI-NEXT: mov h1, v0.h[1]
1629-
; CHECK-GI-NEXT: mov h2, v0.h[2]
1630-
; CHECK-GI-NEXT: mov h3, v0.h[3]
1631-
; CHECK-GI-NEXT: fmov w8, s0
1632-
; CHECK-GI-NEXT: fmov w10, s1
1633-
; CHECK-GI-NEXT: fmov w11, s2
1634-
; CHECK-GI-NEXT: fmov w12, s3
1635-
; CHECK-GI-NEXT: and w9, w8, #0xff
1636-
; CHECK-GI-NEXT: cmp w9, w10, uxtb
1637-
; CHECK-GI-NEXT: and w9, w11, #0xff
1638-
; CHECK-GI-NEXT: csel w8, w8, w10, hi
1639-
; CHECK-GI-NEXT: cmp w9, w12, uxtb
1640-
; CHECK-GI-NEXT: csel w9, w11, w12, hi
1610+
; CHECK-GI-NEXT: umov w8, v0.h[0]
1611+
; CHECK-GI-NEXT: umov w9, v0.h[1]
1612+
; CHECK-GI-NEXT: umov w10, v0.h[2]
1613+
; CHECK-GI-NEXT: umov w11, v0.h[3]
1614+
; CHECK-GI-NEXT: and w12, w8, #0xff
1615+
; CHECK-GI-NEXT: cmp w12, w9, uxtb
1616+
; CHECK-GI-NEXT: and w12, w10, #0xff
1617+
; CHECK-GI-NEXT: csel w8, w8, w9, hi
1618+
; CHECK-GI-NEXT: cmp w12, w11, uxtb
1619+
; CHECK-GI-NEXT: csel w9, w10, w11, hi
16411620
; CHECK-GI-NEXT: and w10, w8, #0xff
16421621
; CHECK-GI-NEXT: cmp w10, w9, uxtb
16431622
; CHECK-GI-NEXT: csel w0, w8, w9, hi
@@ -1719,19 +1698,16 @@ define i16 @umaxv_v3i16(<3 x i16> %a) {
17191698
; CHECK-GI: // %bb.0: // %entry
17201699
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
17211700
; CHECK-GI-NEXT: mov h1, v0.h[1]
1722-
; CHECK-GI-NEXT: mov h2, v0.h[2]
1723-
; CHECK-GI-NEXT: fmov w8, s0
1724-
; CHECK-GI-NEXT: fmov w9, s0
1725-
; CHECK-GI-NEXT: uxth w8, w8
1726-
; CHECK-GI-NEXT: fmov w10, s1
1727-
; CHECK-GI-NEXT: fmov w11, s2
1701+
; CHECK-GI-NEXT: umov w8, v0.h[0]
1702+
; CHECK-GI-NEXT: umov w9, v0.h[0]
1703+
; CHECK-GI-NEXT: umov w10, v0.h[1]
1704+
; CHECK-GI-NEXT: umov w11, v0.h[2]
1705+
; CHECK-GI-NEXT: umov w13, v0.h[2]
17281706
; CHECK-GI-NEXT: fmov w12, s1
1729-
; CHECK-GI-NEXT: cmp w8, w10, uxth
1730-
; CHECK-GI-NEXT: uxth w8, w11
1731-
; CHECK-GI-NEXT: fmov w10, s2
1732-
; CHECK-GI-NEXT: csel w9, w9, w12, hi
1733-
; CHECK-GI-NEXT: cmp w8, w9, uxth
1734-
; CHECK-GI-NEXT: csel w0, w9, w10, lo
1707+
; CHECK-GI-NEXT: cmp w8, w12, uxth
1708+
; CHECK-GI-NEXT: csel w8, w9, w10, hi
1709+
; CHECK-GI-NEXT: cmp w11, w8, uxth
1710+
; CHECK-GI-NEXT: csel w0, w8, w13, lo
17351711
; CHECK-GI-NEXT: ret
17361712
entry:
17371713
%arg1 = call i16 @llvm.vector.reduce.umax.v3i16(<3 x i16> %a)

0 commit comments

Comments
 (0)