[AMDGPU] Handle amdgpu.last.use metadata #83816

Merged · 4 commits · Mar 6, 2024
15 changes: 15 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -1299,6 +1299,21 @@ The AMDGPU backend implements the following LLVM IR intrinsics.

List AMDGPU intrinsics.

LLVM IR Metadata
------------------

The AMDGPU backend implements the following LLVM IR metadata.

.. table:: AMDGPU LLVM IR Metadata
:name: amdgpu-llvm-ir-metadata-table

============================================== ==========================================================
LLVM IR Metadata Description
============================================== ==========================================================
!amdgpu.last.use Sets TH_LOAD_LU temporal hint on load instructions that support it.
Takes priority over nontemporal hint (TH_LOAD_NT).
============================================== ==========================================================
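As an illustration (an editor's sketch, not part of this patch), a frontend that knows a loaded value will not be read again attaches the metadata directly to the load; the kernel name below is hypothetical:

  define amdgpu_kernel void @last_use_example(ptr %in, ptr %out) {
  entry:
    ; Marked last-use: on gfx12 this load is expected to select as
    ; flat_load_b32 ... th:TH_LOAD_LU (see the new test at the end of this PR).
    %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}
    store i32 %val, ptr %out, align 4
    ret void
  }

Compiling such IR with llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 (the RUN line used by the new test) should show the th:TH_LOAD_LU operand on the load.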

LLVM IR Attributes
------------------

7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16177,9 +16177,12 @@ bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
MachineMemOperand::Flags
SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
// Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
+  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
   if (I.getMetadata("amdgpu.noclobber"))
-    return MONoClobber;
-  return MachineMemOperand::MONone;
+    Flags |= MONoClobber;
+  if (I.getMetadata("amdgpu.last.use"))
+    Flags |= MOLastUse;
+  return Flags;
}

bool SITargetLowering::checkForPhysRegDependency(
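For context (an editor's note with a hypothetical example, not something this patch adds): because getTargetMMOFlags now accumulates flags instead of returning at the first match, a load carrying both !amdgpu.noclobber (set internally by AMDGPUAnnotateUniformValues) and !amdgpu.last.use gets an MMO flagged with MONoClobber | MOLastUse:

  define amdgpu_kernel void @noclobber_and_last_use(ptr addrspace(1) %p, ptr addrspace(1) %q) {
  entry:
    ; Both hints on one load: after this change its MachineMemOperand
    ; carries MONoClobber | MOLastUse rather than MONoClobber alone.
    %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !{}, !amdgpu.last.use !{}
    store i32 %v, ptr addrspace(1) %q, align 4
    ret void
  }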
99 changes: 55 additions & 44 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -98,22 +98,22 @@ class SIMemOpInfo final {
bool IsCrossAddressSpaceOrdering = false;
bool IsVolatile = false;
bool IsNonTemporal = false;

-  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
-              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
-              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
-              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
-              bool IsCrossAddressSpaceOrdering = true,
-              AtomicOrdering FailureOrdering =
-                  AtomicOrdering::SequentiallyConsistent,
-              bool IsVolatile = false,
-              bool IsNonTemporal = false)
-      : Ordering(Ordering), FailureOrdering(FailureOrdering),
-        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
-        InstrAddrSpace(InstrAddrSpace),
-        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
-        IsVolatile(IsVolatile),
-        IsNonTemporal(IsNonTemporal) {
+  bool IsLastUse = false;
+
+  SIMemOpInfo(
+      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
+      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
+      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
+      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
+      bool IsCrossAddressSpaceOrdering = true,
+      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
+      bool IsVolatile = false, bool IsNonTemporal = false,
+      bool IsLastUse = false)
+      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
+        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
+        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
+        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
+        IsLastUse(IsLastUse) {

if (Ordering == AtomicOrdering::NotAtomic) {
assert(Scope == SIAtomicScope::NONE &&
@@ -201,6 +201,10 @@ class SIMemOpInfo final {
return IsNonTemporal;
}

/// \returns True if memory access of the machine instruction used to
/// create this SIMemOpInfo is last use, false otherwise.
bool isLastUse() const { return IsLastUse; }

/// \returns True if ordering constraint of the machine instruction used to
/// create this SIMemOpInfo is unordered or higher, false otherwise.
bool isAtomic() const {
@@ -305,12 +309,13 @@ class SICacheControl {
SIAtomicAddrSpace AddrSpace) const = 0;

/// Update \p MI memory instruction of kind \p Op associated with address
-  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
-  /// true iff the instruction was modified.
+  /// spaces \p AddrSpace to indicate it is volatile and/or
+  /// nontemporal/last-use. Return true iff the instruction was modified.
virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace,
SIMemOp Op, bool IsVolatile,
-                                              bool IsNonTemporal) const = 0;
+                                              bool IsNonTemporal,
+                                              bool IsLastUse = false) const = 0;

virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
return false;
@@ -394,8 +399,8 @@ class SIGfx6CacheControl : public SICacheControl {

bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsVolatile,
-                                      bool IsNonTemporal) const override;
+                                      bool IsVolatile, bool IsNonTemporal,
+                                      bool IsLastUse) const override;

bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -447,8 +452,8 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {

bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsVolatile,
-                                      bool IsNonTemporal) const override;
+                                      bool IsVolatile, bool IsNonTemporal,
+                                      bool IsLastUse) const override;

bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -508,8 +513,8 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl {

bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsVolatile,
-                                      bool IsNonTemporal) const override;
+                                      bool IsVolatile, bool IsNonTemporal,
+                                      bool IsLastUse) const override;

bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
@@ -552,8 +557,8 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {

bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsVolatile,
-                                      bool IsNonTemporal) const override;
+                                      bool IsVolatile, bool IsNonTemporal,
+                                      bool IsLastUse) const override;

bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -578,8 +583,8 @@ class SIGfx11CacheControl : public SIGfx10CacheControl {

bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsVolatile,
-                                      bool IsNonTemporal) const override;
+                                      bool IsVolatile, bool IsNonTemporal,
+                                      bool IsLastUse) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
@@ -614,8 +619,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {

bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsVolatile,
-                                      bool IsNonTemporal) const override;
+                                      bool IsVolatile, bool IsNonTemporal,
+                                      bool IsLastUse) const override;

bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
};
@@ -745,12 +750,14 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
bool IsNonTemporal = true;
bool IsVolatile = false;
bool IsLastUse = false;

// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
for (const auto &MMO : MI->memoperands()) {
IsNonTemporal &= MMO->isNonTemporal();
IsVolatile |= MMO->isVolatile();
IsLastUse |= MMO->getFlags() & MOLastUse;
InstrAddrSpace |=
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
@@ -792,7 +799,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
-                     IsNonTemporal);
+                     IsNonTemporal, IsLastUse);
}

std::optional<SIMemOpInfo>
@@ -969,7 +976,7 @@ bool SIGfx6CacheControl::enableRMWCacheBypass(

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal) const {
+    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
@@ -1322,7 +1329,7 @@ bool SIGfx90ACacheControl::enableRMWCacheBypass(

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal) const {
+    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
@@ -1624,7 +1631,7 @@ bool SIGfx940CacheControl::enableRMWCacheBypass(

bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal) const {
+    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
@@ -1856,7 +1863,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal) const {
+    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
@@ -2127,7 +2134,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass(

bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal) const {
+    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
@@ -2379,7 +2386,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,

bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal) const {
+    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {

// Only handle load and store, not atomic read-modify-write instructions.
assert(MI->mayLoad() ^ MI->mayStore());
@@ -2392,7 +2399,10 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(

bool Changed = false;

-  if (IsNonTemporal) {
+  if (IsLastUse) {
+    // Set last-use hint.
+    Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
+  } else if (IsNonTemporal) {
// Set non-temporal hint for all cache levels.
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
}
@@ -2472,11 +2482,12 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
}

// Atomic instructions already bypass caches to the scope specified by the
-  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
-  // need additional treatment.
-  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
-                                                SIMemOp::LOAD, MOI.isVolatile(),
-                                                MOI.isNonTemporal());
+  // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
+  // instructions need additional treatment.
+  Changed |= CC->enableVolatileAndOrNonTemporal(
+      MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
+      MOI.isNonTemporal(), MOI.isLastUse());

return Changed;
}

83 changes: 83 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -0,0 +1,83 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s

define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
entry:
%val = load i32, ptr %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr %out
ret void
}

define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_load_1:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
%val = load i32, ptr %val.gep, align 4, !amdgpu.last.use !{}
store i32 %val, ptr %out
ret void
}

define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr %out
ret void
}

define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
entry:
%val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0
store i32 %val, ptr %out
ret void
}

!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX12-CU: {{.*}}
; GFX12-WGP: {{.*}}