Skip to content

Commit 4751d38

Browse files
committed
[AMDGPU][GlobalISel] Combine (sext (trunc (sext_in_reg x)))
This is a bit of an awkward pattern that can come up as a result of legalization and then widening of i16 operations to i32 in RegBankSelect on AMDGPU. This quick combine avoids redundant patterns like
```
s_sext_i32_i8 s0, s0
s_sext_i32_i16 s0, s0
s_ashr_i32 s0, s0, s1
```
With this combine, the second sext is removed because it is redundant.
1 parent 65de524 commit 4751d38

File tree

3 files changed

+113
-63
lines changed

3 files changed

+113
-63
lines changed

llvm/include/llvm/Target/GlobalISel/Combine.td

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,14 @@ def sext_trunc_sextload : GICombineRule<
258258
[{ return Helper.matchSextTruncSextLoad(*${d}); }]),
259259
(apply [{ Helper.applySextTruncSextLoad(*${d}); }])>;
260260

261+
// Fold (G_SEXT (G_TRUNC (G_SEXT_INREG $src, $width))) into the G_SEXT_INREG
// result itself.
//
// The rule is rooted at the outer G_SEXT ($dst). It only fires when the
// scalar size of the truncated value ($trunc) is at least $width bits, i.e.
// the G_TRUNC does not drop any of the bits that G_SEXT_INREG sign-extended
// from. In that case the outer G_SEXT merely recreates the sign bits that
// $sir already carries, so every use of $dst is rewritten to use $sir.
//
// NOTE(review): GIReplaceReg presumably declines to apply when $dst and $sir
// cannot be substituted for one another (e.g. differing types) - confirm.
def sext_trunc_sextinreg : GICombineRule<
262+
(defs root:$dst),
263+
(match (G_SEXT_INREG $sir, $src, $width),
264+
(G_TRUNC $trunc, $sir),
265+
(G_SEXT $dst, $trunc),
266+
[{ return (MRI.getType(${trunc}.getReg()).getScalarSizeInBits() >= ${width}.getImm()); }]),
267+
(apply (GIReplaceReg $dst, $sir))>;
268+
261269
def sext_inreg_of_load_matchdata : GIDefMatchData<"std::tuple<Register, unsigned>">;
262270
def sext_inreg_of_load : GICombineRule<
263271
(defs root:$root, sext_inreg_of_load_matchdata:$matchinfo),
@@ -1896,7 +1904,9 @@ def cast_of_cast_combines: GICombineGroup<[
18961904
sext_of_anyext,
18971905
anyext_of_anyext,
18981906
anyext_of_zext,
1899-
anyext_of_sext
1907+
anyext_of_sext,
1908+
1909+
sext_trunc_sextinreg
19001910
]>;
19011911

19021912
def cast_combines: GICombineGroup<[
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
3+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
4+
5+
---
6+
name: trunc_s16_inreg_8
7+
tracksRegLiveness: true
8+
body: |
9+
bb.0:
10+
liveins: $vgpr0
11+
; CHECK-LABEL: name: trunc_s16_inreg_8
12+
; CHECK: liveins: $vgpr0
13+
; CHECK-NEXT: {{ $}}
14+
; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
15+
; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
16+
; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
17+
%copy:_(s32) = COPY $vgpr0
18+
%inreg:_(s32) = G_SEXT_INREG %copy, 8
19+
%trunc:_(s16) = G_TRUNC %inreg
20+
%sext:_(s32) = G_SEXT %trunc
21+
$vgpr0 = COPY %sext
22+
...
23+
24+
---
25+
name: trunc_s16_inreg_16
26+
tracksRegLiveness: true
27+
body: |
28+
bb.0:
29+
liveins: $vgpr0
30+
; CHECK-LABEL: name: trunc_s16_inreg_16
31+
; CHECK: liveins: $vgpr0
32+
; CHECK-NEXT: {{ $}}
33+
; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
34+
; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
35+
; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
36+
%copy:_(s32) = COPY $vgpr0
37+
%inreg:_(s32) = G_SEXT_INREG %copy, 16
38+
%trunc:_(s16) = G_TRUNC %inreg
39+
%sext:_(s32) = G_SEXT %trunc
40+
$vgpr0 = COPY %sext
41+
...
42+
43+
---
44+
name: trunc_s8_inreg_16
45+
tracksRegLiveness: true
46+
body: |
47+
bb.0:
48+
liveins: $vgpr0
49+
; CHECK-LABEL: name: trunc_s8_inreg_16
50+
; CHECK: liveins: $vgpr0
51+
; CHECK-NEXT: {{ $}}
52+
; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
53+
; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
54+
; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
55+
; CHECK-NEXT: %sext:_(s32) = G_SEXT %trunc(s8)
56+
; CHECK-NEXT: $vgpr0 = COPY %sext(s32)
57+
%copy:_(s32) = COPY $vgpr0
58+
%inreg:_(s32) = G_SEXT_INREG %copy, 16
59+
%trunc:_(s8) = G_TRUNC %inreg
60+
%sext:_(s32) = G_SEXT %trunc
61+
$vgpr0 = COPY %sext
62+
...
63+
64+
# TODO?: We could handle this by inserting a trunc, but I'm not sure how useful that'd be.
65+
---
66+
name: mismatching_types
67+
tracksRegLiveness: true
68+
body: |
69+
bb.0:
70+
liveins: $vgpr0
71+
; CHECK-LABEL: name: mismatching_types
72+
; CHECK: liveins: $vgpr0
73+
; CHECK-NEXT: {{ $}}
74+
; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
75+
; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
76+
; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
77+
; CHECK-NEXT: %sext:_(s16) = G_SEXT %trunc(s8)
78+
; CHECK-NEXT: %anyext:_(s32) = G_ANYEXT %sext(s16)
79+
; CHECK-NEXT: $vgpr0 = COPY %anyext(s32)
80+
%copy:_(s32) = COPY $vgpr0
81+
%inreg:_(s32) = G_SEXT_INREG %copy, 8
82+
%trunc:_(s8) = G_TRUNC %inreg
83+
%sext:_(s16) = G_SEXT %trunc
84+
%anyext:_(s32) = G_ANYEXT %sext
85+
$vgpr0 = COPY %anyext
86+
...

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll

Lines changed: 16 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -197,33 +197,13 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
197197
}
198198

199199
define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
200-
; GFX6-LABEL: abs_sgpr_v2i8:
201-
; GFX6: ; %bb.0:
202-
; GFX6-NEXT: s_sext_i32_i8 s0, s0
203-
; GFX6-NEXT: s_sext_i32_i8 s1, s1
204-
; GFX6-NEXT: s_abs_i32 s0, s0
205-
; GFX6-NEXT: s_abs_i32 s1, s1
206-
; GFX6-NEXT: ; return to shader part epilog
207-
;
208-
; GFX8-LABEL: abs_sgpr_v2i8:
209-
; GFX8: ; %bb.0:
210-
; GFX8-NEXT: s_sext_i32_i8 s0, s0
211-
; GFX8-NEXT: s_sext_i32_i8 s1, s1
212-
; GFX8-NEXT: s_sext_i32_i16 s0, s0
213-
; GFX8-NEXT: s_sext_i32_i16 s1, s1
214-
; GFX8-NEXT: s_abs_i32 s0, s0
215-
; GFX8-NEXT: s_abs_i32 s1, s1
216-
; GFX8-NEXT: ; return to shader part epilog
217-
;
218-
; GFX10-LABEL: abs_sgpr_v2i8:
219-
; GFX10: ; %bb.0:
220-
; GFX10-NEXT: s_sext_i32_i8 s0, s0
221-
; GFX10-NEXT: s_sext_i32_i8 s1, s1
222-
; GFX10-NEXT: s_sext_i32_i16 s0, s0
223-
; GFX10-NEXT: s_sext_i32_i16 s1, s1
224-
; GFX10-NEXT: s_abs_i32 s0, s0
225-
; GFX10-NEXT: s_abs_i32 s1, s1
226-
; GFX10-NEXT: ; return to shader part epilog
200+
; GFX-LABEL: abs_sgpr_v2i8:
201+
; GFX: ; %bb.0:
202+
; GFX-NEXT: s_sext_i32_i8 s0, s0
203+
; GFX-NEXT: s_sext_i32_i8 s1, s1
204+
; GFX-NEXT: s_abs_i32 s0, s0
205+
; GFX-NEXT: s_abs_i32 s1, s1
206+
; GFX-NEXT: ; return to shader part epilog
227207
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
228208
ret <2 x i8> %res
229209
}
@@ -268,41 +248,15 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
268248
}
269249

270250
define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
271-
; GFX6-LABEL: abs_sgpr_v3i8:
272-
; GFX6: ; %bb.0:
273-
; GFX6-NEXT: s_sext_i32_i8 s0, s0
274-
; GFX6-NEXT: s_sext_i32_i8 s1, s1
275-
; GFX6-NEXT: s_sext_i32_i8 s2, s2
276-
; GFX6-NEXT: s_abs_i32 s0, s0
277-
; GFX6-NEXT: s_abs_i32 s1, s1
278-
; GFX6-NEXT: s_abs_i32 s2, s2
279-
; GFX6-NEXT: ; return to shader part epilog
280-
;
281-
; GFX8-LABEL: abs_sgpr_v3i8:
282-
; GFX8: ; %bb.0:
283-
; GFX8-NEXT: s_sext_i32_i8 s0, s0
284-
; GFX8-NEXT: s_sext_i32_i8 s1, s1
285-
; GFX8-NEXT: s_sext_i32_i8 s2, s2
286-
; GFX8-NEXT: s_sext_i32_i16 s0, s0
287-
; GFX8-NEXT: s_sext_i32_i16 s1, s1
288-
; GFX8-NEXT: s_sext_i32_i16 s2, s2
289-
; GFX8-NEXT: s_abs_i32 s0, s0
290-
; GFX8-NEXT: s_abs_i32 s1, s1
291-
; GFX8-NEXT: s_abs_i32 s2, s2
292-
; GFX8-NEXT: ; return to shader part epilog
293-
;
294-
; GFX10-LABEL: abs_sgpr_v3i8:
295-
; GFX10: ; %bb.0:
296-
; GFX10-NEXT: s_sext_i32_i8 s0, s0
297-
; GFX10-NEXT: s_sext_i32_i8 s1, s1
298-
; GFX10-NEXT: s_sext_i32_i8 s2, s2
299-
; GFX10-NEXT: s_sext_i32_i16 s0, s0
300-
; GFX10-NEXT: s_sext_i32_i16 s1, s1
301-
; GFX10-NEXT: s_sext_i32_i16 s2, s2
302-
; GFX10-NEXT: s_abs_i32 s0, s0
303-
; GFX10-NEXT: s_abs_i32 s1, s1
304-
; GFX10-NEXT: s_abs_i32 s2, s2
305-
; GFX10-NEXT: ; return to shader part epilog
251+
; GFX-LABEL: abs_sgpr_v3i8:
252+
; GFX: ; %bb.0:
253+
; GFX-NEXT: s_sext_i32_i8 s0, s0
254+
; GFX-NEXT: s_sext_i32_i8 s1, s1
255+
; GFX-NEXT: s_sext_i32_i8 s2, s2
256+
; GFX-NEXT: s_abs_i32 s0, s0
257+
; GFX-NEXT: s_abs_i32 s1, s1
258+
; GFX-NEXT: s_abs_i32 s2, s2
259+
; GFX-NEXT: ; return to shader part epilog
306260
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
307261
ret <3 x i8> %res
308262
}

0 commit comments

Comments
 (0)