Skip to content

Commit 2f11ad0

Browse files
committed
AMDGPU: Do not try to commute instruction with same input register
There's little point to trying to commute an instruction if the two operands are already the same. This avoids an assertion in a future patch, but this likely isn't the correct fix. The worklist management in SIFoldOperands is dodgy, and we should probably fix it to work like PeepholeOpt (i.e. stop looking at use lists, and fold from users). This is an extension of the already handled special case which it's trying to avoid folding an instruction which is already being folded.
1 parent e5b8e8e commit 2f11ad0

File tree

6 files changed

+102
-27
lines changed

6 files changed

+102
-27
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -691,11 +691,21 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
691691
if (!CanCommute)
692692
return false;
693693

694+
MachineOperand &Op = MI->getOperand(OpNo);
695+
MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
696+
694697
// One of operands might be an Imm operand, and OpNo may refer to it after
695698
// the call of commuteInstruction() below. Such situations are avoided
696699
// here explicitly as OpNo must be a register operand to be a candidate
697700
// for memory folding.
698-
if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
701+
if (!Op.isReg() || !CommutedOp.isReg())
702+
return false;
703+
704+
// The same situation with an immediate could reproduce if both inputs are
705+
// the same register.
706+
if (Op.isReg() && CommutedOp.isReg() &&
707+
(Op.getReg() == CommutedOp.getReg() &&
708+
Op.getSubReg() == CommutedOp.getSubReg()))
699709
return false;
700710

701711
if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))

llvm/test/CodeGen/AMDGPU/dag-divergence.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %f
3737
; GCN-LABEL: {{^}}wide_carry_divergence_error:
3838
; GCN: v_sub_u32_e32
3939
; GCN: v_subb_u32_e32
40-
; GCN: v_subbrev_u32_e32
41-
; GCN: v_subbrev_u32_e32
40+
; GCN: v_subb_u32_e32
41+
; GCN: v_subb_u32_e32
4242
define <2 x i128> @wide_carry_divergence_error(i128 %arg) {
4343
%i = call i128 @llvm.ctlz.i128(i128 %arg, i1 false)
4444
%i1 = sub i128 0, %i

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
6565
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc
6666
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
6767
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc
68-
; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v5, vcc
69-
; GFX9-NEXT: v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc
68+
; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, 0, v5, vcc
69+
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v5, vcc
7070
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
7171
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
7272
; GFX9-NEXT: v_mov_b32_e32 v18, v16
@@ -2355,8 +2355,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
23552355
; GFX9-NEXT: v_sub_co_u32_e32 v12, vcc, v8, v9
23562356
; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v13, vcc
23572357
; GFX9-NEXT: v_mov_b32_e32 v8, 0
2358-
; GFX9-NEXT: v_subbrev_co_u32_e32 v14, vcc, 0, v8, vcc
2359-
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v8, vcc
2358+
; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v8, vcc
2359+
; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, 0, v8, vcc
23602360
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13]
23612361
; GFX9-NEXT: v_or_b32_e32 v10, v13, v15
23622362
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc

llvm/test/CodeGen/AMDGPU/div_v2i128.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
6666
; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
6767
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc
6868
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2
69-
; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v18, vcc
69+
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc
7070
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3]
7171
; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
72-
; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v18, vcc
72+
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc
7373
; SDAG-NEXT: v_or_b32_e32 v8, v8, v10
7474
; SDAG-NEXT: v_or_b32_e32 v9, v3, v11
7575
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
@@ -263,10 +263,10 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
263263
; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13
264264
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc
265265
; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4
266-
; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v8, vcc
266+
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v8, vcc
267267
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5]
268268
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
269-
; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v8, vcc
269+
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc
270270
; SDAG-NEXT: v_or_b32_e32 v8, v9, v10
271271
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
272272
; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
@@ -872,10 +872,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
872872
; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v18
873873
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc
874874
; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22
875-
; SDAG-NEXT: v_subbrev_u32_e32 v24, vcc, 0, v28, vcc
875+
; SDAG-NEXT: v_subb_u32_e32 v24, vcc, 0, v28, vcc
876876
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23]
877877
; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
878-
; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc
878+
; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v28, vcc
879879
; SDAG-NEXT: v_or_b32_e32 v16, v16, v24
880880
; SDAG-NEXT: v_or_b32_e32 v17, v23, v25
881881
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25]
@@ -1047,10 +1047,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
10471047
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
10481048
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc
10491049
; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0
1050-
; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v24, vcc
1050+
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v24, vcc
10511051
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1]
10521052
; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
1053-
; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v24, vcc
1053+
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v24, vcc
10541054
; SDAG-NEXT: v_or_b32_e32 v2, v2, v20
10551055
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
10561056
; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
@@ -1619,10 +1619,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
16191619
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10
16201620
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc
16211621
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10
1622-
; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc
1622+
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc
16231623
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
16241624
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
1625-
; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc
1625+
; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v19, vcc
16261626
; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
16271627
; SDAG-NEXT: v_or_b32_e32 v9, v11, v19
16281628
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
@@ -1814,10 +1814,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
18141814
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
18151815
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc
18161816
; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v10
1817-
; SDAG-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v18, vcc
1817+
; SDAG-NEXT: v_subb_u32_e32 v12, vcc, 0, v18, vcc
18181818
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
18191819
; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
1820-
; SDAG-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v18, vcc
1820+
; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v18, vcc
18211821
; SDAG-NEXT: v_or_b32_e32 v14, v14, v12
18221822
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
18231823
; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
@@ -2502,10 +2502,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
25022502
; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
25032503
; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc
25042504
; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18
2505-
; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
2505+
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc
25062506
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19]
25072507
; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
2508-
; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
2508+
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc
25092509
; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
25102510
; SDAG-NEXT: v_or_b32_e32 v17, v19, v21
25112511
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
@@ -2677,10 +2677,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
26772677
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18
26782678
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc
26792679
; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16
2680-
; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
2680+
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc
26812681
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
26822682
; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
2683-
; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
2683+
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc
26842684
; SDAG-NEXT: v_or_b32_e32 v18, v18, v20
26852685
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
26862686
; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc

llvm/test/CodeGen/AMDGPU/rem_i128.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
6666
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc
6767
; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v7
6868
; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v11, vcc
69-
; GFX9-NEXT: v_subbrev_co_u32_e32 v8, vcc, 0, v9, vcc
70-
; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc
69+
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
70+
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc
7171
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
7272
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
7373
; GFX9-NEXT: v_or_b32_e32 v12, v7, v9
@@ -1541,8 +1541,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
15411541
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v9
15421542
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v10, v12, vcc
15431543
; GFX9-NEXT: v_mov_b32_e32 v11, 0
1544-
; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v11, vcc
1545-
; GFX9-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v11, vcc
1544+
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v11, vcc
1545+
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v11, vcc
15461546
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
15471547
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
15481548
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=peephole-opt,si-fold-operands -o - %s | FileCheck %s
3+
4+
# Check for assert when trying later folds after commuting an
5+
# instruction where both operands are the same register. This depended
6+
# on use list ordering, so we need to run peephole-opt first to
7+
# reproduce.
8+
9+
---
10+
name: commute_add_same_inputs_assert
11+
tracksRegLiveness: true
12+
body: |
13+
bb.0:
14+
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
15+
16+
; CHECK-LABEL: name: commute_add_same_inputs_assert
17+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
18+
; CHECK-NEXT: {{ $}}
19+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
20+
; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 0, killed [[COPY]], implicit-def $vcc, implicit $exec
21+
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
22+
; CHECK-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
23+
; CHECK-NEXT: $vgpr4 = COPY [[V_ADDC_U32_e32_]]
24+
; CHECK-NEXT: SI_RETURN implicit $vgpr4
25+
%0:vgpr_32 = COPY $vgpr0
26+
%1:sreg_64 = S_MOV_B64 0
27+
%2:sreg_32 = COPY %1.sub0
28+
%3:vgpr_32 = V_ADD_CO_U32_e32 %2, killed %0, implicit-def $vcc, implicit $exec
29+
%4:vgpr_32 = COPY %2
30+
%5:vgpr_32 = COPY %2
31+
%6:vgpr_32 = V_ADDC_U32_e32 %4, %5, implicit-def $vcc, implicit $vcc, implicit $exec
32+
$vgpr4 = COPY %6
33+
SI_RETURN implicit $vgpr4
34+
35+
...
36+
37+
---
38+
name: commute_sub_same_inputs_assert
39+
tracksRegLiveness: true
40+
body: |
41+
bb.0:
42+
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
43+
44+
; CHECK-LABEL: name: commute_sub_same_inputs_assert
45+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
46+
; CHECK-NEXT: {{ $}}
47+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
48+
; CHECK-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 0, killed [[COPY]], implicit-def $vcc, implicit $exec
49+
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
50+
; CHECK-NEXT: [[V_SUBB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBB_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
51+
; CHECK-NEXT: $vgpr4 = COPY [[V_SUBB_U32_e32_]]
52+
; CHECK-NEXT: SI_RETURN implicit $vgpr4
53+
%0:vgpr_32 = COPY $vgpr0
54+
%1:sreg_64 = S_MOV_B64 0
55+
%2:sreg_32 = COPY %1.sub0
56+
%3:vgpr_32 = V_SUB_CO_U32_e32 %2, killed %0, implicit-def $vcc, implicit $exec
57+
%4:vgpr_32 = COPY %2
58+
%5:vgpr_32 = COPY %2
59+
%6:vgpr_32 = V_SUBB_U32_e32 %5, %4, implicit-def $vcc, implicit $vcc, implicit $exec
60+
$vgpr4 = COPY %6
61+
SI_RETURN implicit $vgpr4
62+
63+
...
64+
65+

0 commit comments

Comments
 (0)