
Commit b585a1b

[AMDGPU] Handle amdgpu.last.use metadata
Convert !amdgpu.last.use metadata into a Machine Memory Operand flag (MOLastUse) and handle it in SIMemoryLegalizer similarly to nontemporal and volatile.
1 parent c66560c

File tree

5 files changed, +259 -5 lines
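
In IR, the metadata is attached directly to a load instruction (a minimal sketch matching the new tests below; on GFX12 the memory legalizer then emits the load with the last-use temporal hint):

  %val = load i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
  ; GFX12: global_load_b32 ... th:TH_LOAD_LU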

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 2 deletions
@@ -16181,9 +16181,12 @@ bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
 MachineMemOperand::Flags
 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
   // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
+  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
   if (I.getMetadata("amdgpu.noclobber"))
-    return MONoClobber;
-  return MachineMemOperand::MONone;
+    Flags |= MONoClobber;
+  if (I.getMetadata("amdgpu.last.use"))
+    Flags |= MOLastUse;
+  return Flags;
 }
 
 bool SITargetLowering::checkForPhysRegDependency(
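
For readability, this is the resulting function after the hunk above (a reconstruction from the diff; MONoClobber and MOLastUse are AMDGPU-specific MachineMemOperand flags defined elsewhere in the backend):

MachineMemOperand::Flags
SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
  // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
  // Flags accumulate rather than returning early, so more than one
  // metadata-derived flag can apply to the same access.
  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
  if (I.getMetadata("amdgpu.noclobber"))
    Flags |= MONoClobber;
  if (I.getMetadata("amdgpu.last.use"))
    Flags |= MOLastUse;
  return Flags;
}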

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 53 additions & 3 deletions
@@ -98,6 +98,7 @@ class SIMemOpInfo final {
   bool IsCrossAddressSpaceOrdering = false;
   bool IsVolatile = false;
   bool IsNonTemporal = false;
+  bool IsLastUse = false;
 
   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
@@ -107,13 +108,15 @@ class SIMemOpInfo final {
               AtomicOrdering FailureOrdering =
                   AtomicOrdering::SequentiallyConsistent,
               bool IsVolatile = false,
-              bool IsNonTemporal = false)
+              bool IsNonTemporal = false,
+              bool IsLastUse = false)
       : Ordering(Ordering), FailureOrdering(FailureOrdering),
         Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
         InstrAddrSpace(InstrAddrSpace),
         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
         IsVolatile(IsVolatile),
-        IsNonTemporal(IsNonTemporal) {
+        IsNonTemporal(IsNonTemporal),
+        IsLastUse(IsLastUse) {
 
     if (Ordering == AtomicOrdering::NotAtomic) {
       assert(Scope == SIAtomicScope::NONE &&
@@ -201,6 +204,12 @@ class SIMemOpInfo final {
     return IsNonTemporal;
   }
 
+  /// \returns True if memory access of the machine instruction used to
+  /// create this SIMemOpInfo is last use, false otherwise.
+  bool isLastUse() const {
+    return IsLastUse;
+  }
+
   /// \returns True if ordering constraint of the machine instruction used to
   /// create this SIMemOpInfo is unordered or higher, false otherwise.
   bool isAtomic() const {
@@ -316,6 +325,12 @@ class SICacheControl {
     return false;
   };
 
+  /// Update \p MI memory instruction to indicate it is a last use. Return true
+  /// iff the instruction was modified.
+  virtual bool enableLastUse(MachineInstr &MI, bool IsLastUse) const {
+    return false;
+  }
+
   /// Inserts any necessary instructions at position \p Pos relative
   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
   /// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -592,6 +607,10 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
   // MI. \returns Returns true if \p MI is modified, false otherwise.
   bool setScope(const MachineBasicBlock::iterator MI,
                 AMDGPU::CPol::CPol Value) const;
+  // Checks if the CPol operand is present in instruction \p MI and if the
+  // current scope policy is the same as \p Value.
+  bool isScope(const MachineBasicBlock::iterator MI,
+               AMDGPU::CPol::CPol Value) const;
 
   // Stores with system scope (SCOPE_SYS) need to wait for:
   // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
@@ -618,6 +637,9 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
                                      bool IsNonTemporal) const override;
 
   bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
+
+  bool enableLastUse(MachineInstr &MI,
+                     bool IsLastUse) const override;
 };
 
 class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -745,12 +767,14 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
   bool IsNonTemporal = true;
   bool IsVolatile = false;
+  bool IsLastUse = false;
 
   // Validator should check whether or not MMOs cover the entire set of
   // locations accessed by the memory instruction.
   for (const auto &MMO : MI->memoperands()) {
     IsNonTemporal &= MMO->isNonTemporal();
     IsVolatile |= MMO->isVolatile();
+    IsLastUse |= MMO->getFlags() & MOLastUse;
     InstrAddrSpace |=
         toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
@@ -792,7 +816,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
   }
   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
-                     IsNonTemporal);
+                     IsNonTemporal, IsLastUse);
 }
 
 std::optional<SIMemOpInfo>
@@ -2209,6 +2233,15 @@ bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
   return false;
 }
 
+bool SIGfx12CacheControl::isScope(const MachineBasicBlock::iterator MI,
+                                  AMDGPU::CPol::CPol Value) const {
+  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+  if (!CPol)
+    return false;
+
+  return (CPol->getImm() & AMDGPU::CPol::SCOPE) == Value;
+}
+
 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
     const MachineBasicBlock::iterator MI) const {
   // TODO: implement flag for frontend to give us a hint not to insert waits.
@@ -2415,6 +2448,16 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx12CacheControl::enableLastUse(MachineInstr &MI,
+                                        bool IsLastUse) const {
+  assert(MI.mayLoad() && !MI.mayStore());
+
+  if (IsLastUse && !isScope(MI, AMDGPU::CPol::SCOPE_SYS))
+    return setTH(MI, AMDGPU::CPol::TH_LU);
+
+  return false;
+}
+
 bool SIGfx12CacheControl::expandSystemScopeStore(
     MachineBasicBlock::iterator &MI) const {
   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
@@ -2471,12 +2514,19 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
     return Changed;
   }
 
+  // enableVolatileAndOrNonTemporal can insert instructions and advance the
+  // iterator MI, and we need the original instruction for enabling last use.
+  MachineInstr &Inst = *MI;
+
   // Atomic instructions already bypass caches to the scope specified by the
   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
   // need additional treatment.
   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                 SIMemOp::LOAD, MOI.isVolatile(),
                                                 MOI.isNonTemporal());
+
+  Changed |= CC->enableLastUse(Inst, MOI.isLastUse());
+
   return Changed;
 }
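
Note the ordering in expandLoad, which the volatile tests below exercise: enableVolatileAndOrNonTemporal runs first and gives volatile loads SCOPE_SYS, and enableLastUse skips loads whose scope is already SCOPE_SYS, so a volatile last-use load keeps scope:SCOPE_SYS rather than gaining th:TH_LOAD_LU. A sketch of the net effect on GFX12 (illustrative operands):

  %a = load volatile i32, ptr %p, align 4, !amdgpu.last.use !{}  ; -> flat_load_b32 ... scope:SCOPE_SYS
  %b = load i32, ptr %q, align 4, !amdgpu.last.use !{}           ; -> flat_load_b32 ... th:TH_LOAD_LU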

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s

define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_load_0:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
entry:
  %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}
  store i32 %val, ptr %out
  ret void
}

define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_load_1:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_u32 v0, s0, s0, v0
; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX12-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
; GFX12-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
  %val = load i32, ptr %val.gep, align 4, !amdgpu.last.use !{}
  store i32 %val, ptr %out
  ret void
}

define amdgpu_kernel void @flat_last_use_volatile_load(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_volatile_load:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
entry:
  %val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{}
  store i32 %val, ptr %out
  ret void
}

!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX12-CU: {{.*}}
; GFX12-WGP: {{.*}}

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s

define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GFX12-LABEL: global_last_use_load_0:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
entry:
  %val = load i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GFX12-LABEL: global_last_use_load_1:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v0, v0, s[0:1] th:TH_LOAD_LU
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
  %val = load i32, ptr addrspace(1) %val.gep, align 4, !amdgpu.last.use !{}
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @global_last_use_volatile_load(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GFX12-LABEL: global_last_use_volatile_load:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
entry:
  %val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
  store i32 %val, ptr addrspace(1) %out
  ret void
}

!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX12-CU: {{.*}}
; GFX12-WGP: {{.*}}

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s

define amdgpu_kernel void @private_nontemporal_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) {
; GFX12-LABEL: private_nontemporal_load_0:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT:    v_mov_b32_e32 v1, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_LU
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
entry:
  %val = load i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{}
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @private_nontemporal_load_1(ptr addrspace(5) %in, ptr addrspace(1) %out) {
; GFX12-LABEL: private_nontemporal_load_1:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    scratch_load_b32 v0, v0, s2 th:TH_LOAD_LU
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
  %val = load i32, ptr addrspace(5) %val.gep, align 4, !amdgpu.last.use !{}
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @private_nontemporal_volatile_load(ptr addrspace(5) %in, ptr addrspace(1) %out) {
; GFX12-LABEL: private_nontemporal_volatile_load:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT:    v_mov_b32_e32 v1, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    scratch_load_b32 v0, off, s2 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
entry:
  %val = load volatile i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{}
  store i32 %val, ptr addrspace(1) %out
  ret void
}

!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX12-CU: {{.*}}
; GFX12-WGP: {{.*}}
