Skip to content

[AMDGPU] Enhance s_waitcnt insertion before barrier for gfx12 #90595

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1857,7 +1857,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// not, we need to ensure the subtarget is capable of backing off barrier
// instructions in case there are any outstanding memory operations that may
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
if (TII->isBarrierStart(MI.getOpcode()) &&
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
}
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -925,6 +925,17 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
}

// Returns true if Opcode begins a barrier. Before gfx12 the only such opcode
// is S_BARRIER; once barriers are split into S_BARRIER_SIGNAL* and
// S_BARRIER_WAIT, the signal variants (M0/IMM, with or without ISFIRST) are
// the ones that start the barrier.
bool isBarrierStart(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::S_BARRIER:
  case AMDGPU::S_BARRIER_SIGNAL_M0:
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
  case AMDGPU::S_BARRIER_SIGNAL_IMM:
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
    return true;
  default:
    return false;
  }
}

// Returns true if MI's instruction descriptor has the
// SIInstrFlags::TiedSourceNotRead flag set in its TSFlags.
static bool doesNotReadTiedSource(const MachineInstr &MI) {
  return (MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead) != 0;
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT4-NEXT: s_wait_kmcnt 0x0
; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2
; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1]
; VARIANT4-NEXT: s_wait_storecnt 0x0
; VARIANT4-NEXT: s_barrier_signal -1
; VARIANT4-NEXT: s_barrier_wait -1
; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
Expand Down Expand Up @@ -142,6 +143,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0
; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1]
; VARIANT6-NEXT: s_wait_storecnt 0x0
; VARIANT6-NEXT: s_barrier_signal -1
; VARIANT6-NEXT: s_barrier_wait -1
; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
Expand Down
22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal -1
; GCN-NEXT: s_barrier_wait -1
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
Expand All @@ -28,6 +29,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
Expand Down Expand Up @@ -56,6 +58,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal 1
; GCN-NEXT: s_barrier_wait 1
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
Expand All @@ -72,6 +75,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal 1
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
Expand Down Expand Up @@ -100,6 +104,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal 0
; GCN-NEXT: s_barrier_wait 0
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
Expand All @@ -116,6 +121,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal 0
; GLOBAL-ISEL-NEXT: s_barrier_wait 0
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
Expand Down Expand Up @@ -146,6 +152,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v3, v1, s[0:1]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal m0
; GCN-NEXT: s_barrier_wait 1
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
Expand All @@ -163,6 +170,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal m0
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
Expand Down Expand Up @@ -192,6 +200,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal m0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -203,6 +212,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal m0
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg)
Expand All @@ -216,6 +226,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst -1
; GCN-NEXT: s_cselect_b32 s3, s3, s5
; GCN-NEXT: s_cselect_b32 s2, s2, s4
Expand All @@ -235,6 +246,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1
; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
Expand Down Expand Up @@ -270,6 +282,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst 1
; GCN-NEXT: s_cselect_b32 s3, s3, s5
; GCN-NEXT: s_cselect_b32 s2, s2, s4
Expand All @@ -289,6 +302,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
Expand Down Expand Up @@ -324,6 +338,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst 1
; GCN-NEXT: s_cselect_b32 s3, s3, s5
; GCN-NEXT: s_cselect_b32 s2, s2, s4
Expand All @@ -343,6 +358,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
Expand Down Expand Up @@ -379,6 +395,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
; GCN-NEXT: s_mov_b32 m0, 1
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst m0
; GCN-NEXT: s_cselect_b32 s3, s3, s5
; GCN-NEXT: s_cselect_b32 s2, s2, s4
Expand All @@ -399,6 +416,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
Expand Down Expand Up @@ -444,6 +462,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GCN-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9
; GCN-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
; GCN-NEXT: global_store_b32 v[7:8], v10, off
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst m0
; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
Expand All @@ -470,6 +489,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GLOBAL-ISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v9, 0
; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v9, off
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
Expand Down Expand Up @@ -1339,6 +1359,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal -1
; GCN-NEXT: s_barrier_wait -1
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
Expand All @@ -1355,6 +1376,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
Expand Down
Loading