Skip to content

Commit 77cfa4f

Browse files
arsenm authored and pravinjagtap committed
AMDGPU: Handle gfx950 valu write vdst + permlane read hazard (llvm#117287)
1 parent 1b17ae2 commit 77cfa4f

File tree

4 files changed

+153
-2
lines changed

4 files changed

+153
-2
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2547,8 +2547,34 @@ int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
25472547
return isVCmpXWritesExec(*TII, *TRI, MI);
25482548
};
25492549

2550-
const int NumWaitStates = 4;
2551-
return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates);
2550+
auto IsVALUFn = [](const MachineInstr &MI) {
2551+
return SIInstrInfo::isVALU(MI);
2552+
};
2553+
2554+
const int VCmpXWritesExecWaitStates = 4;
2555+
const int VALUWritesVDstWaitStates = 2;
2556+
int WaitStatesNeeded = 0;
2557+
2558+
for (const MachineOperand &Op : MI->explicit_uses()) {
2559+
if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2560+
continue;
2561+
Register Reg = Op.getReg();
2562+
2563+
int WaitStatesSinceDef =
2564+
VALUWritesVDstWaitStates -
2565+
getWaitStatesSinceDef(Reg, IsVALUFn,
2566+
/*MaxWaitStates=*/VALUWritesVDstWaitStates);
2567+
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2568+
if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2569+
break;
2570+
}
2571+
2572+
int VCmpXHazardWaits =
2573+
VCmpXWritesExecWaitStates -
2574+
getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2575+
2576+
WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2577+
return WaitStatesNeeded;
25522578
}
25532579

25542580
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {

llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir

Lines changed: 113 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -142,3 +142,116 @@ body: |
142142
$vgpr4 = V_MOV_B32_e32 0, implicit $exec
143143
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
144144
...
145+
146+
---
147+
# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0
148+
# GCN: V_MOV_B32
149+
# GCN-NEXT: S_NOP 1
150+
# GCN-NEXT: V_PERMLANE
151+
name: valu_write_vdst_read_permlane16_swap_0
152+
body: |
153+
bb.0:
154+
liveins: $vgpr1
155+
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
156+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
157+
...
158+
159+
---
160+
# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1
161+
# GCN: V_MOV_B32
162+
# GCN-NEXT: S_NOP 1
163+
# GCN-NEXT: V_PERMLANE
164+
name: valu_write_vdst_read_permlane16_swap_1
165+
body: |
166+
bb.0:
167+
liveins: $vgpr0
168+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
169+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
170+
...
171+
172+
---
173+
# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0
174+
# GCN: V_MOV_B32
175+
# GCN-NEXT: S_NOP 1
176+
# GCN-NEXT: V_PERMLANE
177+
name: valu_write_vdst_read_permlane32_swap_0
178+
body: |
179+
bb.0:
180+
liveins: $vgpr1
181+
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
182+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
183+
...
184+
185+
---
186+
# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1
187+
# GCN: V_MOV_B32
188+
# GCN-NEXT: S_NOP 1
189+
# GCN-NEXT: V_PERMLANE
190+
name: valu_write_vdst_read_permlane32_swap_1
191+
body: |
192+
bb.0:
193+
liveins: $vgpr0
194+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
195+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
196+
...
197+
198+
---
199+
# No hazard, write of other register
200+
# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg
201+
# GCN: V_MOV_B32
202+
# GCN-NEXT: V_PERMLANE
203+
name: valu_write_vdst_read_permlane16_swap_0_otherreg
204+
body: |
205+
bb.0:
206+
liveins: $vgpr1
207+
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
208+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
209+
...
210+
211+
---
212+
# Both permlane hazards at once.
213+
# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
214+
# GCN: V_MOV_B32
215+
# GCN: V_CMPX_EQ_I32
216+
# GCN-NEXT: S_NOP 3
217+
# GCN-NEXT: V_PERMLANE
218+
name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
219+
body: |
220+
bb.0:
221+
liveins: $vgpr0, $vgpr2, $vgpr3
222+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
223+
$exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
224+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
225+
...
226+
227+
---
228+
# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
229+
# GCN: V_CMPX_EQ_I32
230+
# GCN: V_MOV_B32
231+
# GCN-NEXT: S_NOP 2
232+
# GCN-NEXT: V_PERMLANE
233+
name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
234+
body: |
235+
bb.0:
236+
liveins: $vgpr0, $vgpr2, $vgpr3
237+
$exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
238+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
239+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
240+
...
241+
242+
---
243+
# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
244+
# GCN: V_CMPX_EQ_I32
245+
# GCN: V_MOV_B32
246+
# GCN: V_MOV_B32
247+
# GCN-NEXT: S_NOP 1
248+
# GCN-NEXT: V_PERMLANE
249+
name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
250+
body: |
251+
bb.0:
252+
liveins: $vgpr0, $vgpr2, $vgpr3
253+
$exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
254+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
255+
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
256+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
257+
...

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
2626
; GCN: ; %bb.0:
2727
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2828
; GCN-NEXT: v_mov_b32_e32 v1, 1
29+
; GCN-NEXT: s_nop 1
2930
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
3031
; GCN-NEXT: s_setpc_b64 s[30:31]
3132
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
3738
; GCN: ; %bb.0:
3839
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3940
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
41+
; GCN-NEXT: s_nop 1
4042
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
4143
; GCN-NEXT: s_setpc_b64 s[30:31]
4244
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
4951
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5052
; GCN-NEXT: v_mov_b32_e32 v1, v0
5153
; GCN-NEXT: v_mov_b32_e32 v0, 1
54+
; GCN-NEXT: s_nop 1
5255
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
5356
; GCN-NEXT: s_setpc_b64 s[30:31]
5457
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
6164
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6265
; GCN-NEXT: v_mov_b32_e32 v0, s0
6366
; GCN-NEXT: v_mov_b32_e32 v1, s1
67+
; GCN-NEXT: s_nop 1
6468
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
6569
; GCN-NEXT: s_setpc_b64 s[30:31]
6670
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
7377
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7478
; GCN-NEXT: v_mov_b32_e32 v1, v0
7579
; GCN-NEXT: v_mov_b32_e32 v0, s0
80+
; GCN-NEXT: s_nop 1
7681
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
7782
; GCN-NEXT: s_setpc_b64 s[30:31]
7883
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
8489
; GCN: ; %bb.0:
8590
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8691
; GCN-NEXT: v_mov_b32_e32 v1, s0
92+
; GCN-NEXT: s_nop 1
8793
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
8894
; GCN-NEXT: s_setpc_b64 s[30:31]
8995
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vi(i32 %vdst_old) {
2626
; GCN: ; %bb.0:
2727
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2828
; GCN-NEXT: v_mov_b32_e32 v1, 1
29+
; GCN-NEXT: s_nop 1
2930
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
3031
; GCN-NEXT: s_setpc_b64 s[30:31]
3132
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vl(i32 %vdst_old) {
3738
; GCN: ; %bb.0:
3839
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3940
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
41+
; GCN-NEXT: s_nop 1
4042
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
4143
; GCN-NEXT: s_setpc_b64 s[30:31]
4244
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane32_swap_b32_iv(i32 %src0_old) {
4951
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5052
; GCN-NEXT: v_mov_b32_e32 v1, v0
5153
; GCN-NEXT: v_mov_b32_e32 v0, 1
54+
; GCN-NEXT: s_nop 1
5255
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
5356
; GCN-NEXT: s_setpc_b64 s[30:31]
5457
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane32_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
6164
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6265
; GCN-NEXT: v_mov_b32_e32 v0, s0
6366
; GCN-NEXT: v_mov_b32_e32 v1, s1
67+
; GCN-NEXT: s_nop 1
6468
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
6569
; GCN-NEXT: s_setpc_b64 s[30:31]
6670
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane32_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
7377
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7478
; GCN-NEXT: v_mov_b32_e32 v1, v0
7579
; GCN-NEXT: v_mov_b32_e32 v0, s0
80+
; GCN-NEXT: s_nop 1
7681
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
7782
; GCN-NEXT: s_setpc_b64 s[30:31]
7883
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
8489
; GCN: ; %bb.0:
8590
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8691
; GCN-NEXT: v_mov_b32_e32 v1, s0
92+
; GCN-NEXT: s_nop 1
8793
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
8894
; GCN-NEXT: s_setpc_b64 s[30:31]
8995
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)

0 commit comments

Comments
 (0)