-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[AMDGPU] Make getAssumedAddrSpace
return AS1 for pointer kernel arguments
#137488
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[AMDGPU] Make getAssumedAddrSpace
return AS1 for pointer kernel arguments
#137488
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking.
[AMDGPU] Make getAssumedAddrSpace return AS1 for pointer kernel arguments
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) ChangesPatch is 1.04 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137488.diff 37 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index eb50617b281a1..150060e1b266c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -951,6 +951,10 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
}
unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
+ if (auto *Arg = dyn_cast<Argument>(V);
+ Arg && AMDGPU::isKernelCC(Arg->getParent()))
+ return AMDGPUAS::GLOBAL_ADDRESS;
+
const auto *LD = dyn_cast<LoadInst>(V);
if (!LD) // TODO: Handle invariant load like constant.
return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 1032dede7cb36..b6ad73a6acc94 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12597,29 +12597,18 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
}
ChangeStatus updateImpl(Attributor &A) override {
- unsigned FlatAS = A.getInfoCache().getFlatAddressSpace().value();
uint32_t OldAddressSpace = AssumedAddressSpace;
auto CheckAddressSpace = [&](Value &Obj) {
if (isa<UndefValue>(&Obj))
return true;
- // If an argument in flat address space only has addrspace cast uses, and
- // those casts are same, then we take the dst addrspace.
if (auto *Arg = dyn_cast<Argument>(&Obj)) {
- if (Arg->getType()->getPointerAddressSpace() == FlatAS) {
- unsigned CastAddrSpace = FlatAS;
- for (auto *U : Arg->users()) {
- auto *ASCI = dyn_cast<AddrSpaceCastInst>(U);
- if (!ASCI)
- return takeAddressSpace(Obj.getType()->getPointerAddressSpace());
- if (CastAddrSpace != FlatAS &&
- CastAddrSpace != ASCI->getDestAddressSpace())
- return false;
- CastAddrSpace = ASCI->getDestAddressSpace();
- }
- if (CastAddrSpace != FlatAS)
- return takeAddressSpace(CastAddrSpace);
- }
+ auto *TTI =
+ A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
+ *Arg->getParent());
+ unsigned AssumedAS = TTI->getAssumedAddrSpace(Arg);
+ if (AssumedAS != ~0U)
+ return takeAddressSpace(AssumedAS);
}
return takeAddressSpace(Obj.getType()->getPointerAddressSpace());
};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index aeb301939e986..0347d9e5c3110 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -946,7 +946,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -964,7 +964,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -974,52 +974,38 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_ret_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_ret_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
store i32 %result, ptr %out, align 4
@@ -1040,7 +1026,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1060,7 +1046,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1070,54 +1056,38 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s2, s2, 16
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -1139,7 +1109,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1159,7 +1129,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1169,54 +1139,38 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s2, s2, 16
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
@@ -1236,7 +1190,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
@@ -1251,37 +1205,28 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_dec_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_atomic_dec v[0:1], v2
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_noret_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_atomic_dec v[0:1], v2
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1290,11 +1235,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX11-LABEL: flat_atomic_dec_noret_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1317,7 +1260,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
@@ -1334,39 +1277,28 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 16
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_atomic_dec v[0:1], v2
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1375,11 +1307,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX11-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1403,7 +1333,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we delete the AMDGPUPromoteKernelArguments pass after this (but really clang should be fixed, we shouldn't have to do this)
if (auto *Arg = dyn_cast<Argument>(V);
    Arg && AMDGPU::isKernelCC(Arg->getParent()))
  return AMDGPUAS::GLOBAL_ADDRESS;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably need to defend against byref values. Ideally we would only use byref, in which case this needs to look at the load from the argument. But then if we're changing the IR calling convention lowering we might as well fix it to emit addrspace(1) in the first place
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we have something like ptr byref(i32) %in.byref, I suppose it is also in AS1?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or 4, yes. But if we're fixing clang to do the right thing, shouldn't need to handle the flat case
Make preparation for #137488.
Yes, that's the plan.
And this is also in the TODO list. |
Make preparation for #137488.
97eaa7b
to
bfda0fe
Compare
[AMDGPU] Make getAssumedAddrSpace return AS1 for pointer kernel arguments
bfda0fe
to
8e7a023
Compare
No description provided.