Skip to content

Commit 3b88805

Browse files
authored
[AMDGPU] Fix SDWA commuting (#106920)
SDWA insts miss reverse opcode, which causes them to be treated as commutable with default reverse opcode i.e. their own opcode. As a result, SWDA F16 sub A, B and Sub B, A are merged by machine CSE. The correct behavior is to merged sub A, B and subrev B, A instead of sub B, A. This issues caused failures in rocFFT tests. Another issue is that src0_sel and src1_sel are not swapped when SDWA insts are commuted. Verified that this fixes rocFFT tests failure.
1 parent 1e75d08 commit 3b88805

File tree

5 files changed

+131
-5
lines changed

5 files changed

+131
-5
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2788,6 +2788,9 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
27882788
swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
27892789
Src1, AMDGPU::OpName::src1_modifiers);
27902790

2791+
swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2792+
AMDGPU::OpName::src1_sel);
2793+
27912794
CommutedMI->setDesc(get(CommutedOpcode));
27922795
}
27932796

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -167,9 +167,11 @@ multiclass VOP2Inst_e64<string opName,
167167
}
168168

169169
multiclass VOP2Inst_sdwa<string opName,
170-
VOPProfile P> {
170+
VOPProfile P,
171+
string revOp = opName> {
171172
if P.HasExtSDWA then
172-
def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
173+
def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
174+
Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
173175
}
174176

175177
multiclass VOP2Inst<string opName,
@@ -178,7 +180,7 @@ multiclass VOP2Inst<string opName,
178180
string revOp = opName> :
179181
VOP2Inst_e32<opName, P, node, revOp>,
180182
VOP2Inst_e64<opName, P, node, revOp>,
181-
VOP2Inst_sdwa<opName, P> {
183+
VOP2Inst_sdwa<opName, P, revOp> {
182184
if P.HasExtDPP then
183185
def _dpp : VOP2_DPP_Pseudo <opName, P>;
184186
}
@@ -225,7 +227,7 @@ multiclass VOP2Inst_VOPD<string opName,
225227
string revOp = opName> :
226228
VOP2Inst_e32_VOPD<opName, P, VOPDOp, VOPDName, node, revOp>,
227229
VOP2Inst_e64<opName, P, node, revOp>,
228-
VOP2Inst_sdwa<opName, P> {
230+
VOP2Inst_sdwa<opName, P, revOp> {
229231
if P.HasExtDPP then
230232
def _dpp : VOP2_DPP_Pseudo <opName, P>;
231233
}
@@ -243,7 +245,8 @@ multiclass VOP2bInst <string opName,
243245
}
244246

245247
if P.HasExtSDWA then
246-
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
248+
def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
249+
Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> {
247250
let AsmMatchConverter = "cvtSdwaVOP2b";
248251
}
249252
if P.HasExtDPP then
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=machine-cse -verify-machineinstrs %s -o - 2>&1 | FileCheck --check-prefix=GCN %s
2+
3+
# GCN-LABEL: name: test_machine_cse_op_sel
4+
# GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
5+
# GCN: %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
6+
# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
7+
---
8+
name: test_machine_cse_op_sel
9+
body: |
10+
bb.0:
11+
%0:vgpr_32 = IMPLICIT_DEF
12+
%1:vgpr_32 = IMPLICIT_DEF
13+
%2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
14+
%3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
15+
DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
16+
...
17+
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s
3+
4+
%ret_struct = type { half, half }
5+
6+
define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrspace(3) %arg1, ptr addrspace(3) %arg2, ptr addrspace(3) %arg3) {
7+
; CHECK-LABEL: extracted_values:
8+
; CHECK: ; %bb.0: ; %entry
9+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10+
; CHECK-NEXT: ds_read_b32 v3, v3
11+
; CHECK-NEXT: ds_read_b32 v4, v4
12+
; CHECK-NEXT: ds_read_b32 v2, v2
13+
; CHECK-NEXT: ds_read_b32 v5, v5
14+
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
15+
; CHECK-NEXT: v_sub_f16_sdwa v6, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
16+
; CHECK-NEXT: v_sub_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
17+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
18+
; CHECK-NEXT: v_sub_f16_sdwa v7, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
19+
; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
20+
; CHECK-NEXT: v_add_f16_e32 v4, v6, v7
21+
; CHECK-NEXT: v_add_f16_e32 v2, v3, v2
22+
; CHECK-NEXT: flat_store_short v[0:1], v4
23+
; CHECK-NEXT: flat_store_short v[0:1], v2 offset:2
24+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
25+
; CHECK-NEXT: s_setpc_b64 s[30:31]
26+
entry:
27+
%tmp0 = load <2 x half>, ptr addrspace(3) %arg1, align 4
28+
%tmp1 = extractelement <2 x half> %tmp0, i64 1
29+
%tmp2 = load <2 x half>, ptr addrspace(3) %arg2, align 4
30+
%tmp3 = extractelement <2 x half> %tmp2, i64 1
31+
%tmp4 = fsub contract half %tmp1, %tmp3
32+
%tmp5 = load <2 x half>, ptr addrspace(3) %arg0, align 4
33+
%tmp6 = extractelement <2 x half> %tmp5, i64 1
34+
%tmp7 = load <2 x half>, ptr addrspace(3) %arg3, align 4
35+
%tmp8 = extractelement <2 x half> %tmp7, i64 1
36+
%tmp9 = fsub contract half %tmp6, %tmp8
37+
%tmp10 = fadd contract half %tmp4, %tmp9
38+
%tmp11 = fsub contract half %tmp3, %tmp1
39+
%tmp12 = fsub contract half %tmp8, %tmp6
40+
%tmp13 = fadd contract half %tmp11, %tmp12
41+
%field_ptr = getelementptr %ret_struct, ptr %ret_struct, i32 0, i32 0
42+
store half %tmp10, ptr %field_ptr, align 2
43+
%field_ptr1 = getelementptr %ret_struct, ptr %ret_struct, i32 0, i32 1
44+
store half %tmp13, ptr %field_ptr1, align 2
45+
ret void
46+
}

llvm/test/CodeGen/AMDGPU/sdwa-cse.mir

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=machine-cse -verify-machineinstrs %s -o - 2>&1 | FileCheck --check-prefix=GCN %s
2+
3+
# GCN-LABEL: name: test_machine_cse_subtraction_sdwa_f16_no_merge
4+
# GCN: %2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
5+
# GCN: %3:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %1.sub0, 0, %0.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
6+
# GCN: %5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
7+
# GCN: %6:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
8+
# GCN: DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
9+
---
10+
name: test_machine_cse_subtraction_sdwa_f16_no_merge
11+
body: |
12+
bb.0:
13+
%0:vreg_64 = IMPLICIT_DEF
14+
%1:vreg_64 = IMPLICIT_DEF
15+
%2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
16+
%3:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %1.sub0, 0, %0.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
17+
%4:vgpr_32 = IMPLICIT_DEF
18+
%5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
19+
%6:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
20+
DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
21+
...
22+
23+
# GCN-LABEL: name: test_machine_cse_subtraction_sdwa_f16_merge_same_src_sel
24+
# GCN: %2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
25+
# GCN: %5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
26+
# GCN: DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %5, 0, 1, 0, implicit $exec
27+
---
28+
name: test_machine_cse_subtraction_sdwa_f16_merge_same_src_sel
29+
body: |
30+
bb.0:
31+
%0:vreg_64 = IMPLICIT_DEF
32+
%1:vreg_64 = IMPLICIT_DEF
33+
%2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
34+
%3:vgpr_32 = contract nofpexcept V_SUBREV_F16_sdwa 0, %1.sub0, 0, %0.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
35+
%4:vgpr_32 = IMPLICIT_DEF
36+
%5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
37+
%6:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
38+
DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
39+
...
40+
41+
# GCN-LABEL: name: test_machine_cse_subtraction_sdwa_f16_merge_diff_src_sel
42+
# GCN: %2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec
43+
# GCN: %5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
44+
# GCN: DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %5, 0, 1, 0, implicit $exec
45+
---
46+
name: test_machine_cse_subtraction_sdwa_f16_merge_diff_src_sel
47+
body: |
48+
bb.0:
49+
%0:vreg_64 = IMPLICIT_DEF
50+
%1:vreg_64 = IMPLICIT_DEF
51+
%2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec
52+
%3:vgpr_32 = contract nofpexcept V_SUBREV_F16_sdwa 0, %1.sub0, 0, %0.sub0, 0, 0, 6, 0, 5, 6, implicit $mode, implicit $exec
53+
%4:vgpr_32 = IMPLICIT_DEF
54+
%5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
55+
%6:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
56+
DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
57+
...

0 commit comments

Comments
 (0)