Skip to content

Commit 924a64a

Browse files
authored
[AMDGPU] Only emit SCOPE_SYS global_wb (#110636)
global_wb with scopes lower than SCOPE_SYS is unnecessary for correctness. I was initially optimistic that these would be very cheap no-ops, but they can actually be quite expensive, so let's avoid them.
1 parent 4ec06b1 commit 924a64a

38 files changed

+260
-1729
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 126 additions & 208 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 7 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -2477,49 +2477,27 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
24772477
if (Pos == Position::AFTER)
24782478
++MI;
24792479

2480-
// GLOBAL_WB is always needed, even for write-through caches, as it
2481-
// additionally ensures all operations have reached the desired cache level.
2480+
// global_wb is only necessary at system scope for gfx120x targets.
24822481
//
2483-
// Note that we can technically skip emission of SCOPE_SE writebacks for
2484-
// gfx120x as L1 is a buffer there (hence forwards all to L2), but we still
2485-
// emit them. The current strategy we use is to favor mirroring SW semantics
2486-
// in the ISA whenever it is correct, and the performance cost is very low.
2487-
//
2488-
// This makes the memory model easier to understand, maintain, and also
2489-
// reduces the potential for bugs as it is sometimes difficult to anticipate
2490-
// all possible scenarios in which the WB will actually be needed.
2491-
bool SkipWB = false;
2492-
AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2482+
// Emitting it for lower scopes is a slow no-op, so we omit it
2483+
// for performance.
24932484
switch (Scope) {
24942485
case SIAtomicScope::SYSTEM:
2495-
ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2486+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2487+
.addImm(AMDGPU::CPol::SCOPE_SYS);
24962488
break;
24972489
case SIAtomicScope::AGENT:
2498-
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2499-
break;
25002490
case SIAtomicScope::WORKGROUP:
2501-
// In WGP mode the waves of a work-group can be executing on either CU of
2502-
// the WGP. Therefore we need to ensure all operations have reached L1,
2503-
// hence the SCOPE_SE WB.
2504-
// For CU mode, we need operations to reach L0, so the wait is enough -
2505-
// there are no ways for an operation to report completion without reaching
2506-
// at least L0.
2507-
if (ST.isCuModeEnabled())
2508-
SkipWB = true;
2509-
else
2510-
ScopeImm = AMDGPU::CPol::SCOPE_SE;
2491+
// No WB necessary, but we still have to wait.
25112492
break;
25122493
case SIAtomicScope::WAVEFRONT:
25132494
case SIAtomicScope::SINGLETHREAD:
2514-
// No cache to invalidate.
2495+
// No WB or wait necessary here.
25152496
return false;
25162497
default:
25172498
llvm_unreachable("Unsupported synchronization scope");
25182499
}
25192500

2520-
if (!SkipWB)
2521-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm);
2522-
25232501
if (Pos == Position::AFTER)
25242502
--MI;
25252503

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 3 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
1818
; GFX12-NEXT: s_wait_samplecnt 0x0
1919
; GFX12-NEXT: s_wait_bvhcnt 0x0
2020
; GFX12-NEXT: s_wait_kmcnt 0x0
21-
; GFX12-NEXT: global_wb scope:SCOPE_SE
2221
; GFX12-NEXT: s_wait_storecnt 0x0
2322
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
2423
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -91,7 +90,6 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
9190
; GFX12-NEXT: s_wait_samplecnt 0x0
9291
; GFX12-NEXT: s_wait_bvhcnt 0x0
9392
; GFX12-NEXT: s_wait_kmcnt 0x0
94-
; GFX12-NEXT: global_wb scope:SCOPE_SE
9593
; GFX12-NEXT: s_wait_storecnt 0x0
9694
; GFX12-NEXT: ds_max_num_f32 v0, v1
9795
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -164,7 +162,6 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
164162
; GFX12-NEXT: s_wait_samplecnt 0x0
165163
; GFX12-NEXT: s_wait_bvhcnt 0x0
166164
; GFX12-NEXT: s_wait_kmcnt 0x0
167-
; GFX12-NEXT: global_wb scope:SCOPE_SE
168165
; GFX12-NEXT: s_wait_storecnt 0x0
169166
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
170167
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -241,7 +238,6 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
241238
; GFX12-NEXT: s_wait_samplecnt 0x0
242239
; GFX12-NEXT: s_wait_bvhcnt 0x0
243240
; GFX12-NEXT: s_wait_kmcnt 0x0
244-
; GFX12-NEXT: global_wb scope:SCOPE_SE
245241
; GFX12-NEXT: s_wait_storecnt 0x0
246242
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
247243
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -318,7 +314,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
318314
; GFX12-NEXT: s_wait_samplecnt 0x0
319315
; GFX12-NEXT: s_wait_bvhcnt 0x0
320316
; GFX12-NEXT: s_wait_kmcnt 0x0
321-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
322317
; GFX12-NEXT: s_wait_storecnt 0x0
323318
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
324319
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -465,7 +460,6 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
465460
; GFX12-NEXT: s_wait_samplecnt 0x0
466461
; GFX12-NEXT: s_wait_bvhcnt 0x0
467462
; GFX12-NEXT: s_wait_kmcnt 0x0
468-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
469463
; GFX12-NEXT: s_wait_storecnt 0x0
470464
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
471465
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -617,7 +611,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
617611
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
618612
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
619613
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
620-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
621614
; GFX12-NEXT: s_wait_storecnt 0x0
622615
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
623616
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -774,7 +767,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
774767
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
775768
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
776769
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
777-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
778770
; GFX12-NEXT: s_wait_storecnt 0x0
779771
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
780772
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -915,7 +907,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
915907
; GFX12-NEXT: s_wait_samplecnt 0x0
916908
; GFX12-NEXT: s_wait_bvhcnt 0x0
917909
; GFX12-NEXT: s_wait_kmcnt 0x0
918-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
919910
; GFX12-NEXT: s_wait_storecnt 0x0
920911
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
921912
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1058,7 +1049,6 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
10581049
; GFX12-NEXT: s_wait_samplecnt 0x0
10591050
; GFX12-NEXT: s_wait_bvhcnt 0x0
10601051
; GFX12-NEXT: s_wait_kmcnt 0x0
1061-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
10621052
; GFX12-NEXT: s_wait_storecnt 0x0
10631053
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
10641054
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1209,7 +1199,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
12091199
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12101200
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
12111201
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
1212-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
12131202
; GFX12-NEXT: s_wait_storecnt 0x0
12141203
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12151204
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1364,7 +1353,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
13641353
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
13651354
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
13661355
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
1367-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
13681356
; GFX12-NEXT: s_wait_storecnt 0x0
13691357
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13701358
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
15071495
; GFX12-NEXT: s_wait_bvhcnt 0x0
15081496
; GFX12-NEXT: s_wait_kmcnt 0x0
15091497
; GFX12-NEXT: v_mov_b32_e32 v1, s6
1510-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
15111498
; GFX12-NEXT: s_wait_storecnt 0x0
15121499
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
15131500
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1684,7 +1671,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
16841671
; GFX12-NEXT: s_wait_bvhcnt 0x0
16851672
; GFX12-NEXT: s_wait_kmcnt 0x0
16861673
; GFX12-NEXT: v_mov_b32_e32 v1, s6
1687-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
16881674
; GFX12-NEXT: s_wait_storecnt 0x0
16891675
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
16901676
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1865,11 +1851,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
18651851
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
18661852
; GFX12-NEXT: s_wait_loadcnt 0x0
18671853
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1868-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
18691854
; GFX12-NEXT: s_wait_storecnt 0x0
1870-
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
18711855
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1856+
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
18721857
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1858+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
18731859
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
18741860
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18751861
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2058,11 +2044,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
20582044
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
20592045
; GFX12-NEXT: s_wait_loadcnt 0x0
20602046
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
2061-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
20622047
; GFX12-NEXT: s_wait_storecnt 0x0
2048+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
20632049
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
20642050
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
2065-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
20662051
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
20672052
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
20682053
; GFX12-NEXT: s_wait_loadcnt 0x0

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 3 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) {
1818
; GFX12-NEXT: s_wait_samplecnt 0x0
1919
; GFX12-NEXT: s_wait_bvhcnt 0x0
2020
; GFX12-NEXT: s_wait_kmcnt 0x0
21-
; GFX12-NEXT: global_wb scope:SCOPE_SE
2221
; GFX12-NEXT: s_wait_storecnt 0x0
2322
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
2423
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -91,7 +90,6 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) {
9190
; GFX12-NEXT: s_wait_samplecnt 0x0
9291
; GFX12-NEXT: s_wait_bvhcnt 0x0
9392
; GFX12-NEXT: s_wait_kmcnt 0x0
94-
; GFX12-NEXT: global_wb scope:SCOPE_SE
9593
; GFX12-NEXT: s_wait_storecnt 0x0
9694
; GFX12-NEXT: ds_min_num_f32 v0, v1
9795
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -164,7 +162,6 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) {
164162
; GFX12-NEXT: s_wait_samplecnt 0x0
165163
; GFX12-NEXT: s_wait_bvhcnt 0x0
166164
; GFX12-NEXT: s_wait_kmcnt 0x0
167-
; GFX12-NEXT: global_wb scope:SCOPE_SE
168165
; GFX12-NEXT: s_wait_storecnt 0x0
169166
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
170167
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -241,7 +238,6 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) {
241238
; GFX12-NEXT: s_wait_samplecnt 0x0
242239
; GFX12-NEXT: s_wait_bvhcnt 0x0
243240
; GFX12-NEXT: s_wait_kmcnt 0x0
244-
; GFX12-NEXT: global_wb scope:SCOPE_SE
245241
; GFX12-NEXT: s_wait_storecnt 0x0
246242
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
247243
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -318,7 +314,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
318314
; GFX12-NEXT: s_wait_samplecnt 0x0
319315
; GFX12-NEXT: s_wait_bvhcnt 0x0
320316
; GFX12-NEXT: s_wait_kmcnt 0x0
321-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
322317
; GFX12-NEXT: s_wait_storecnt 0x0
323318
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
324319
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -465,7 +460,6 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
465460
; GFX12-NEXT: s_wait_samplecnt 0x0
466461
; GFX12-NEXT: s_wait_bvhcnt 0x0
467462
; GFX12-NEXT: s_wait_kmcnt 0x0
468-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
469463
; GFX12-NEXT: s_wait_storecnt 0x0
470464
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
471465
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -617,7 +611,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
617611
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
618612
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
619613
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
620-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
621614
; GFX12-NEXT: s_wait_storecnt 0x0
622615
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
623616
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -774,7 +767,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
774767
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
775768
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
776769
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
777-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
778770
; GFX12-NEXT: s_wait_storecnt 0x0
779771
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
780772
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -915,7 +907,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
915907
; GFX12-NEXT: s_wait_samplecnt 0x0
916908
; GFX12-NEXT: s_wait_bvhcnt 0x0
917909
; GFX12-NEXT: s_wait_kmcnt 0x0
918-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
919910
; GFX12-NEXT: s_wait_storecnt 0x0
920911
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
921912
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1058,7 +1049,6 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
10581049
; GFX12-NEXT: s_wait_samplecnt 0x0
10591050
; GFX12-NEXT: s_wait_bvhcnt 0x0
10601051
; GFX12-NEXT: s_wait_kmcnt 0x0
1061-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
10621052
; GFX12-NEXT: s_wait_storecnt 0x0
10631053
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
10641054
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1209,7 +1199,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
12091199
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12101200
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
12111201
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
1212-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
12131202
; GFX12-NEXT: s_wait_storecnt 0x0
12141203
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12151204
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1364,7 +1353,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
13641353
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
13651354
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
13661355
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
1367-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
13681356
; GFX12-NEXT: s_wait_storecnt 0x0
13691357
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13701358
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
15071495
; GFX12-NEXT: s_wait_bvhcnt 0x0
15081496
; GFX12-NEXT: s_wait_kmcnt 0x0
15091497
; GFX12-NEXT: v_mov_b32_e32 v1, s6
1510-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
15111498
; GFX12-NEXT: s_wait_storecnt 0x0
15121499
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
15131500
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1684,7 +1671,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
16841671
; GFX12-NEXT: s_wait_bvhcnt 0x0
16851672
; GFX12-NEXT: s_wait_kmcnt 0x0
16861673
; GFX12-NEXT: v_mov_b32_e32 v1, s6
1687-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
16881674
; GFX12-NEXT: s_wait_storecnt 0x0
16891675
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
16901676
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1865,11 +1851,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
18651851
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
18661852
; GFX12-NEXT: s_wait_loadcnt 0x0
18671853
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1868-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
18691854
; GFX12-NEXT: s_wait_storecnt 0x0
1870-
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
18711855
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1856+
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
18721857
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
1858+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
18731859
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
18741860
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18751861
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2058,11 +2044,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
20582044
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
20592045
; GFX12-NEXT: s_wait_loadcnt 0x0
20602046
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
2061-
; GFX12-NEXT: global_wb scope:SCOPE_DEV
20622047
; GFX12-NEXT: s_wait_storecnt 0x0
2048+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
20632049
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
20642050
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
2065-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
20662051
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
20672052
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
20682053
; GFX12-NEXT: s_wait_loadcnt 0x0

0 commit comments

Comments
 (0)