Skip to content

Commit ee646d7

Browse files
davemgreen authored and IanWood1 committed
[AArch64] Add tablegen patterns for i8 and i16 vector insert/extract pairs (llvm#136091)
An i8 or i16 vector extract/insert has to go via an i32 to make sure the types are legal. This patch adds patterns for an extract from an i8/i16 vector that is inserted into an i16/i32 vector. This avoids the round trip via a GPR, which can limit performance.
1 parent f07db33 commit ee646d7

File tree

9 files changed

+467
-639
lines changed

9 files changed

+467
-639
lines changed

llvm/include/llvm/Target/TargetSelectionDAG.td

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -827,8 +827,11 @@ def step_vector : SDNode<"ISD::STEP_VECTOR", SDTypeProfile<1, 1,
827827
def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
828828
[]>;
829829

830-
// vector_extract/vector_insert are deprecated. extractelt/insertelt
831-
// are preferred.
830+
// vector_extract/vector_insert are similar to extractelt/insertelt but allow
831+
// types that require promotion (a v16i8 extract where i8 is not a legal type so
832+
// uses i32 for example). extractelt/insertelt are preferred where the element
833+
// type and the extracted types match due to the extra type checking they
834+
// perform.
832835
def vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
833836
SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>, []>;
834837
def vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7347,6 +7347,41 @@ def : Pat<(v2i32 (vector_insert v2i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))),
73477347
def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), (i64 imm:$Immd))),
73487348
(INSvi64lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$Sn, dsub), 0)>;
73497349

7350+
// Patterns for i8/i16 -> v2i32/v4i16 lane moves via insert and extract that go via i32.
7351+
multiclass Neon_INS_elt_ext_pattern<ValueType VT128, ValueType VT64, ValueType OutVT,
7352+
Instruction INS, SDNodeXForm VecIndexMult> {
7353+
// VT64->OutVT
7354+
def : Pat<(OutVT (vector_insert (OutVT V64:$src),
7355+
(i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))),
7356+
(i64 imm:$Immd))),
7357+
(EXTRACT_SUBREG
7358+
(INS (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$src, dsub), (VecIndexMult imm:$Immd),
7359+
(INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
7360+
dsub)>;
7361+
def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))))),
7362+
(EXTRACT_SUBREG
7363+
(INS (IMPLICIT_DEF), 0,
7364+
(INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
7365+
dsub)>;
7366+
7367+
// VT128->OutVT
7368+
def : Pat<(OutVT (vector_insert (OutVT V64:$src),
7369+
(i32 (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))),
7370+
(i64 imm:$Immd))),
7371+
(EXTRACT_SUBREG
7372+
(INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), (VecIndexMult imm:$Immd),
7373+
V128:$Rn, imm:$Immn),
7374+
dsub)>;
7375+
def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))))),
7376+
(EXTRACT_SUBREG
7377+
(INS (IMPLICIT_DEF), 0, V128:$Rn, imm:$Immn),
7378+
dsub)>;
7379+
}
7380+
7381+
defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v4i16, INSvi8lane, VecIndex_x2>;
7382+
defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v2i32, INSvi8lane, VecIndex_x4>;
7383+
defm : Neon_INS_elt_ext_pattern<v8i16, v4i16, v2i32, INSvi16lane, VecIndex_x2>;
7384+
73507385
// bitcast of an extract
73517386
// f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane))
73527387
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),

llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,11 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
1111
; CHECK-SDAG-NEXT: // kill: def $w0 killed $w0 def $x0
1212
; CHECK-SDAG-NEXT: // kill: def $d0 killed $d0 def $q0
1313
; CHECK-SDAG-NEXT: str d0, [sp, #8]
14-
; CHECK-SDAG-NEXT: umov w9, v0.b[1]
1514
; CHECK-SDAG-NEXT: bfxil x8, x0, #0, #3
1615
; CHECK-SDAG-NEXT: ld1 { v1.b }[0], [x8]
17-
; CHECK-SDAG-NEXT: umov w8, v0.b[2]
18-
; CHECK-SDAG-NEXT: mov v1.h[1], w9
19-
; CHECK-SDAG-NEXT: umov w9, v0.b[3]
20-
; CHECK-SDAG-NEXT: mov v1.h[2], w8
21-
; CHECK-SDAG-NEXT: mov v1.h[3], w9
16+
; CHECK-SDAG-NEXT: mov v1.b[2], v0.b[1]
17+
; CHECK-SDAG-NEXT: mov v1.b[4], v0.b[2]
18+
; CHECK-SDAG-NEXT: mov v1.b[6], v0.b[3]
2219
; CHECK-SDAG-NEXT: fmov d0, d1
2320
; CHECK-SDAG-NEXT: add sp, sp, #16
2421
; CHECK-SDAG-NEXT: ret
@@ -168,11 +165,10 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) {
168165
; CHECK-SDAG-NEXT: // kill: def $w0 killed $w0 def $x0
169166
; CHECK-SDAG-NEXT: // kill: def $d0 killed $d0 def $q0
170167
; CHECK-SDAG-NEXT: str d0, [sp, #8]
171-
; CHECK-SDAG-NEXT: umov w9, v0.h[1]
172168
; CHECK-SDAG-NEXT: bfi x8, x0, #1, #2
173-
; CHECK-SDAG-NEXT: ld1 { v0.h }[0], [x8]
174-
; CHECK-SDAG-NEXT: mov v0.s[1], w9
175-
; CHECK-SDAG-NEXT: // kill: def $d0 killed $d0 killed $q0
169+
; CHECK-SDAG-NEXT: ld1 { v1.h }[0], [x8]
170+
; CHECK-SDAG-NEXT: mov v1.h[2], v0.h[1]
171+
; CHECK-SDAG-NEXT: fmov d0, d1
176172
; CHECK-SDAG-NEXT: add sp, sp, #16
177173
; CHECK-SDAG-NEXT: ret
178174
;

llvm/test/CodeGen/AArch64/bitcast-extend.ll

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -70,16 +70,12 @@ define <4 x i64> @z_i32_v4i64(i32 %x) {
7070
; CHECK-SD: // %bb.0:
7171
; CHECK-SD-NEXT: fmov s0, w0
7272
; CHECK-SD-NEXT: movi v1.2d, #0x000000000000ff
73-
; CHECK-SD-NEXT: umov w8, v0.b[2]
74-
; CHECK-SD-NEXT: umov w9, v0.b[0]
75-
; CHECK-SD-NEXT: umov w10, v0.b[3]
76-
; CHECK-SD-NEXT: umov w11, v0.b[1]
77-
; CHECK-SD-NEXT: fmov s0, w9
78-
; CHECK-SD-NEXT: fmov s2, w8
79-
; CHECK-SD-NEXT: mov v0.s[1], w11
80-
; CHECK-SD-NEXT: mov v2.s[1], w10
81-
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
82-
; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
73+
; CHECK-SD-NEXT: mov v2.b[0], v0.b[0]
74+
; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
75+
; CHECK-SD-NEXT: mov v2.b[4], v0.b[1]
76+
; CHECK-SD-NEXT: mov v3.b[4], v0.b[3]
77+
; CHECK-SD-NEXT: ushll v0.2d, v2.2s, #0
78+
; CHECK-SD-NEXT: ushll v2.2d, v3.2s, #0
8379
; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
8480
; CHECK-SD-NEXT: and v1.16b, v2.16b, v1.16b
8581
; CHECK-SD-NEXT: ret
@@ -176,16 +172,12 @@ define <4 x i64> @s_i32_v4i64(i32 %x) {
176172
; CHECK-SD-LABEL: s_i32_v4i64:
177173
; CHECK-SD: // %bb.0:
178174
; CHECK-SD-NEXT: fmov s0, w0
179-
; CHECK-SD-NEXT: umov w8, v0.b[2]
180-
; CHECK-SD-NEXT: umov w9, v0.b[0]
181-
; CHECK-SD-NEXT: umov w10, v0.b[3]
182-
; CHECK-SD-NEXT: umov w11, v0.b[1]
183-
; CHECK-SD-NEXT: fmov s0, w9
184-
; CHECK-SD-NEXT: fmov s1, w8
185-
; CHECK-SD-NEXT: mov v0.s[1], w11
186-
; CHECK-SD-NEXT: mov v1.s[1], w10
187-
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
188-
; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0
175+
; CHECK-SD-NEXT: mov v1.b[0], v0.b[0]
176+
; CHECK-SD-NEXT: mov v2.b[0], v0.b[2]
177+
; CHECK-SD-NEXT: mov v1.b[4], v0.b[1]
178+
; CHECK-SD-NEXT: mov v2.b[4], v0.b[3]
179+
; CHECK-SD-NEXT: ushll v0.2d, v1.2s, #0
180+
; CHECK-SD-NEXT: ushll v1.2d, v2.2s, #0
189181
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
190182
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
191183
; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #56

llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,12 @@
55
define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
66
; CHECKLE-LABEL: test_reconstructshuffle:
77
; CHECKLE: // %bb.0:
8-
; CHECKLE-NEXT: umov w8, v0.b[3]
9-
; CHECKLE-NEXT: umov w9, v0.b[2]
10-
; CHECKLE-NEXT: fmov s2, w8
11-
; CHECKLE-NEXT: umov w8, v0.b[1]
12-
; CHECKLE-NEXT: mov v2.h[1], w9
13-
; CHECKLE-NEXT: mov v2.h[2], w8
14-
; CHECKLE-NEXT: umov w8, v0.b[0]
15-
; CHECKLE-NEXT: ext v0.16b, v1.16b, v1.16b, #8
16-
; CHECKLE-NEXT: mov v2.h[3], w8
17-
; CHECKLE-NEXT: zip2 v0.8b, v0.8b, v0.8b
8+
; CHECKLE-NEXT: mov v2.b[0], v0.b[3]
9+
; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
10+
; CHECKLE-NEXT: mov v2.b[2], v0.b[2]
11+
; CHECKLE-NEXT: mov v2.b[4], v0.b[1]
12+
; CHECKLE-NEXT: mov v2.b[6], v0.b[0]
13+
; CHECKLE-NEXT: zip2 v0.8b, v1.8b, v0.8b
1814
; CHECKLE-NEXT: add v0.4h, v2.4h, v0.4h
1915
; CHECKLE-NEXT: bic v0.4h, #255, lsl #8
2016
; CHECKLE-NEXT: ret
@@ -25,16 +21,12 @@ define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
2521
; CHECKBE-NEXT: rev64 v1.16b, v1.16b
2622
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
2723
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
28-
; CHECKBE-NEXT: umov w8, v0.b[3]
29-
; CHECKBE-NEXT: umov w9, v0.b[2]
30-
; CHECKBE-NEXT: fmov s2, w8
31-
; CHECKBE-NEXT: umov w8, v0.b[1]
32-
; CHECKBE-NEXT: mov v2.h[1], w9
33-
; CHECKBE-NEXT: mov v2.h[2], w8
34-
; CHECKBE-NEXT: umov w8, v0.b[0]
35-
; CHECKBE-NEXT: ext v0.16b, v1.16b, v1.16b, #8
36-
; CHECKBE-NEXT: mov v2.h[3], w8
37-
; CHECKBE-NEXT: zip2 v0.8b, v0.8b, v0.8b
24+
; CHECKBE-NEXT: mov v2.b[0], v0.b[3]
25+
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
26+
; CHECKBE-NEXT: mov v2.b[2], v0.b[2]
27+
; CHECKBE-NEXT: mov v2.b[4], v0.b[1]
28+
; CHECKBE-NEXT: mov v2.b[6], v0.b[0]
29+
; CHECKBE-NEXT: zip2 v0.8b, v1.8b, v0.8b
3830
; CHECKBE-NEXT: add v0.4h, v2.4h, v0.4h
3931
; CHECKBE-NEXT: bic v0.4h, #255, lsl #8
4032
; CHECKBE-NEXT: rev64 v0.4h, v0.4h

0 commit comments

Comments
 (0)