Skip to content

Commit f2314c9

Browse files
committed
[WIP][AMDGPU][Attributor] Make AAAMDFlatWorkGroupSize honor existing attribute
If a function has the `amdgpu-flat-work-group-size` attribute, honor it in `initialize`: take its value directly, set it as the known range, and indicate a pessimistic fixed point so that the known range is propagated to the assumed range; otherwise, `initialize` simply does nothing. We no longer clamp the known range (a real clamp, as opposed to the union-style one in `IntegerRangeState`), because doing so can cause issues: the known range acts as a "throttle" on the assumed range, so the assumed range cannot be widened properly in `updateImpl`. Another benefit of not touching the known range in `initialize` is that, if we indicate a pessimistic state in `updateImpl`, the state is also invalid, so `manifest` will not be called. Since we honor the attribute, we do not want any half-baked attribute added to a function.
1 parent 9234ae1 commit f2314c9

25 files changed

+309
-255
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -168,11 +168,21 @@ class AMDGPUInformationCache : public InformationCache {
168168
return ST.supportsGetDoorbellID();
169169
}
170170

171-
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
171+
std::optional<std::pair<unsigned, unsigned>>
172+
getFlatWorkGroupSizeAttr(const Function &F) const {
173+
Attribute A = F.getFnAttribute("amdgpu-flat-work-group-size");
174+
if (!A.isStringAttribute())
175+
return std::nullopt;
172176
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
173177
return ST.getFlatWorkGroupSizes(F);
174178
}
175179

180+
std::pair<unsigned, unsigned>
181+
getDefaultFlatWorkGroupSize(const Function &F) const {
182+
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
183+
return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
184+
}
185+
176186
std::pair<unsigned, unsigned>
177187
getMaximumFlatWorkGroupRange(const Function &F) {
178188
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
@@ -733,6 +743,35 @@ struct AAAMDSizeRangeAttribute
733743
return Change;
734744
}
735745

746+
/// Clamp the assumed range to the default value ([Min, Max]) and emit the
747+
/// attribute if it is not same as default.
748+
ChangeStatus
749+
emitAttributeIfNotDefaultAfterClamp(Attributor &A,
750+
std::pair<unsigned, unsigned> Default) {
751+
auto [Min, Max] = Default;
752+
unsigned Lower = getAssumed().getLower().getZExtValue();
753+
unsigned Upper = getAssumed().getUpper().getZExtValue();
754+
755+
// Clamp the range to the default value.
756+
if (Lower < Min)
757+
Lower = Min;
758+
if (Upper > Max + 1)
759+
Upper = Max + 1;
760+
761+
// No manifest if the value is invalid or same as default after clamp.
762+
if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
763+
return ChangeStatus::UNCHANGED;
764+
765+
Function *F = getAssociatedFunction();
766+
LLVMContext &Ctx = F->getContext();
767+
SmallString<10> Buffer;
768+
raw_svector_ostream OS(Buffer);
769+
OS << Lower << ',' << Upper - 1;
770+
return A.manifestAttrs(getIRPosition(),
771+
{Attribute::get(Ctx, AttrName, OS.str())},
772+
/*ForceReplace=*/true);
773+
}
774+
736775
ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min,
737776
unsigned Max) {
738777
// Don't add the attribute if it's the implied default.
@@ -767,13 +806,21 @@ struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
767806
void initialize(Attributor &A) override {
768807
Function *F = getAssociatedFunction();
769808
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
770-
unsigned MinGroupSize, MaxGroupSize;
771-
std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
772-
intersectKnown(
773-
ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
774809

775-
if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
776-
indicatePessimisticFixpoint();
810+
bool HasAttr = false;
811+
auto [Min, Max] = InfoCache.getDefaultFlatWorkGroupSize(*F);
812+
813+
if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
814+
std::tie(Min, Max) = *Attr;
815+
HasAttr = true;
816+
}
817+
818+
ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
819+
IntegerRangeState RangeState(Range);
820+
clampStateAndIndicateChange(this->getState(), RangeState);
821+
822+
if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
823+
indicateOptimisticFixpoint();
777824
}
778825

779826
ChangeStatus updateImpl(Attributor &A) override {
@@ -787,9 +834,8 @@ struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
787834
ChangeStatus manifest(Attributor &A) override {
788835
Function *F = getAssociatedFunction();
789836
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
790-
unsigned Min, Max;
791-
std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
792-
return emitAttributeIfNotDefault(A, Min, Max);
837+
auto [Min, Max] = InfoCache.getMaximumFlatWorkGroupRange(*F);
838+
return emitAttributeIfNotDefaultAfterClamp(A, {Min, Max});
793839
}
794840

795841
/// See AbstractAttribute::getName()

llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ define ptr addrspace(3) @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 {
217217
; AKF_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3))
218218
;
219219
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group
220-
; ATTRIBUTOR_HSA-SAME: () #[[ATTR3:[0-9]+]] {
220+
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
221221
; ATTRIBUTOR_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3))
222222
;
223223
ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3))
@@ -235,7 +235,6 @@ attributes #1 = { nounwind }
235235
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
236236
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
237237
; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
238-
; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
239238
;.
240239
; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
241240
;.

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() {
7373

7474
define void @func_uses_asm_virtreg_agpr() {
7575
; CHECK-LABEL: define void @func_uses_asm_virtreg_agpr(
76-
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
76+
; CHECK-SAME: ) #[[ATTR0]] {
7777
; CHECK-NEXT: call void asm sideeffect "
7878
; CHECK-NEXT: ret void
7979
;
@@ -83,7 +83,7 @@ define void @func_uses_asm_virtreg_agpr() {
8383

8484
define void @func_uses_asm_physreg_agpr() {
8585
; CHECK-LABEL: define void @func_uses_asm_physreg_agpr(
86-
; CHECK-SAME: ) #[[ATTR2]] {
86+
; CHECK-SAME: ) #[[ATTR0]] {
8787
; CHECK-NEXT: call void asm sideeffect "
8888
; CHECK-NEXT: ret void
8989
;
@@ -93,7 +93,7 @@ define void @func_uses_asm_physreg_agpr() {
9393

9494
define void @func_uses_asm_physreg_agpr_tuple() {
9595
; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple(
96-
; CHECK-SAME: ) #[[ATTR2]] {
96+
; CHECK-SAME: ) #[[ATTR0]] {
9797
; CHECK-NEXT: call void asm sideeffect "
9898
; CHECK-NEXT: ret void
9999
;
@@ -105,7 +105,7 @@ declare void @unknown()
105105

106106
define amdgpu_kernel void @kernel_calls_extern() {
107107
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
108-
; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
108+
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
109109
; CHECK-NEXT: call void @unknown()
110110
; CHECK-NEXT: ret void
111111
;
@@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
115115

116116
define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
117117
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
118-
; CHECK-SAME: ) #[[ATTR4]] {
119-
; CHECK-NEXT: call void @unknown() #[[ATTR9:[0-9]+]]
118+
; CHECK-SAME: ) #[[ATTR2]] {
119+
; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]]
120120
; CHECK-NEXT: ret void
121121
;
122122
call void @unknown() #0
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
125125

126126
define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
127127
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
128-
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
128+
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
129129
; CHECK-NEXT: call void [[INDIRECT]]()
130130
; CHECK-NEXT: ret void
131131
;
@@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
135135

136136
define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
137137
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
138-
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
139-
; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR9]]
138+
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
139+
; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]]
140140
; CHECK-NEXT: ret void
141141
;
142142
call void %indirect() #0
@@ -155,15 +155,15 @@ define amdgpu_kernel void @kernel_transitively_uses_agpr_asm() {
155155

156156
define void @empty() {
157157
; CHECK-LABEL: define void @empty(
158-
; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
158+
; CHECK-SAME: ) #[[ATTR1]] {
159159
; CHECK-NEXT: ret void
160160
;
161161
ret void
162162
}
163163

164164
define void @also_empty() {
165165
; CHECK-LABEL: define void @also_empty(
166-
; CHECK-SAME: ) #[[ATTR5]] {
166+
; CHECK-SAME: ) #[[ATTR1]] {
167167
; CHECK-NEXT: ret void
168168
;
169169
ret void
@@ -256,12 +256,9 @@ attributes #0 = { "amdgpu-no-agpr" }
256256
;.
257257
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
258258
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
259-
; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
260-
; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
261-
; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
262-
; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
263-
; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
264-
; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
265-
; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
266-
; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" }
259+
; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
260+
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
261+
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
262+
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
263+
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-agpr" }
267264
;.

llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,14 @@ define void @call_no_dispatch_id() {
117117
ret void
118118
}
119119
;.
120-
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
121-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
122-
; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
123-
; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
124-
; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
125-
; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
126-
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
127-
; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
128-
; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
129-
; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
120+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
121+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
122+
; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
123+
; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "uniform-work-group-size"="false" }
124+
; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "uniform-work-group-size"="false" }
125+
; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "uniform-work-group-size"="false" }
126+
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "uniform-work-group-size"="false" }
127+
; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "uniform-work-group-size"="false" }
128+
; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
129+
; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
130130
;.

0 commit comments

Comments
 (0)